In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
import requests
import seaborn as sns
sns.set()
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 10
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

import re

# IQR/Z score.
from scipy.stats import stats

# Seleción de variables.
from sklearn.feature_selection import SelectKBest

# Modelos Selección.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Modelos.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold

# Preprocesado.
from sklearn.preprocessing import MinMaxScaler

from imblearn.over_sampling import SMOTE
import pickle
# Métricas.
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve, classification_report
In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Leer los archivos CSV
train_values = pd.read_csv("C:/Users/Fran/Desktop/Master/maching learning tarea/4910797b-ee55-40a7-8668-10efd5c1b960.csv")
train_labels = pd.read_csv("C:/Users/Fran/Desktop/Master/maching learning tarea/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv")
test_values = pd.read_csv("C:/Users/Fran/Desktop/Master/maching learning tarea/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv")

Cargar los datasets y construir un único tablón analítico con todas las variables¶

In [4]:
print('Tamaño de tabla de train_values:',train_values.shape[0])
print('Tamaño de tabla de train_labels:',train_labels.shape[0])
Tamaño de tabla de train_values: 59400
Tamaño de tabla de train_labels: 59400
In [5]:
train_values.shape
Out[5]:
(59400, 40)
In [6]:
# Unir los datos de entrenamiento y las etiquetas
train = train_values.merge(train_labels, on='id')
In [7]:
train.head()
Out[7]:
id amount_tsh date_recorded funder gps_height installer longitude latitude wpt_name num_private ... water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group status_group
0 69572 6000.0 2011-03-14 Roman 1390 Roman 34.938093 -9.856322 none 0 ... soft good enough enough spring spring groundwater communal standpipe communal standpipe functional
1 8776 0.0 2013-03-06 Grumeti 1399 GRUMETI 34.698766 -2.147466 Zahanati 0 ... soft good insufficient insufficient rainwater harvesting rainwater harvesting surface communal standpipe communal standpipe functional
2 34310 25.0 2013-02-25 Lottery Club 686 World vision 37.460664 -3.821329 Kwa Mahundi 0 ... soft good enough enough dam dam surface communal standpipe multiple communal standpipe functional
3 67743 0.0 2013-01-28 Unicef 263 UNICEF 38.486161 -11.155298 Zahanati Ya Nanyumbu 0 ... soft good dry dry machine dbh borehole groundwater communal standpipe multiple communal standpipe non functional
4 19728 0.0 2011-07-13 Action In A 0 Artisan 31.130847 -1.825359 Shuleni 0 ... soft good seasonal seasonal rainwater harvesting rainwater harvesting surface communal standpipe communal standpipe functional

5 rows × 41 columns

In [8]:
train.shape
Out[8]:
(59400, 41)
In [9]:
## Valores únicos
print('Valores únicos: ',len(train['id'].unique()))
Valores únicos:  59400
In [10]:
import pandas as pd

# Convertir la columna 'date_recorded' a datetime
train['date_recorded'] = pd.to_datetime(train['date_recorded'], format='%Y-%m-%d')

# Extraer día, mes y año en columnas separadas
train['day'] = train['date_recorded'].dt.day
train['month'] = train['date_recorded'].dt.month
train['year'] = train['date_recorded'].dt.year

# Eliminar la columna original 'date_recorded' si ya no es necesaria
train.drop(columns='date_recorded', inplace=True)
In [11]:
train.transpose()
Out[11]:
0 1 2 3 4 5 6 7 8 9 ... 59390 59391 59392 59393 59394 59395 59396 59397 59398 59399
id 69572 8776 34310 67743 19728 9944 19816 54551 53934 46144 ... 13677 44885 40607 48348 11164 60739 27263 37057 31282 26348
amount_tsh 6000.0 0.0 25.0 0.0 0.0 20.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 500.0 10.0 4700.0 0.0 0.0 0.0
funder Roman Grumeti Lottery Club Unicef Action In A Mkinga Distric Coun Dwsp Rwssp Wateraid Isingiro Ho ... Rudep Government Of Tanzania Government Of Tanzania Private World Bank Germany Republi Cefa-njombe NaN Malec World Bank
gps_height 1390 1399 686 263 0 0 0 0 0 0 ... 1715 540 0 0 351 1210 1212 0 0 191
installer Roman GRUMETI World vision UNICEF Artisan DWE DWSP DWE Water Aid Artisan ... DWE Government Government Private ML appro CES Cefa NaN Musa World
longitude 34.938093 34.698766 37.460664 38.486161 31.130847 39.172796 33.36241 32.620617 32.7111 30.626991 ... 31.370848 38.04407 33.00944 33.866852 37.634053 37.169807 35.249991 34.017087 35.861315 38.104048
latitude -9.856322 -2.147466 -3.821329 -11.155298 -1.825359 -4.765587 -3.766365 -4.226198 -5.146712 -1.257051 ... -8.25816 -4.272218 -8.520888 -4.28741 -6.12483 -3.253847 -9.070629 -8.750434 -6.378573 -6.747464
wpt_name none Zahanati Kwa Mahundi Zahanati Ya Nanyumbu Shuleni Tajiri Kwa Ngomho Tushirikiane Kwa Ramadhan Musa Kwapeto ... Kwa Mzee Atanas Kwa Benard Charles Kwa Peter Chimeredya Area Three Namba 27 Kwa Yahona Kuvala Mashine Mshoro Kwa Mzee Lugawa
num_private 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
basin Lake Nyasa Lake Victoria Pangani Ruvuma / Southern Coast Lake Victoria Pangani Internal Lake Tanganyika Lake Tanganyika Lake Victoria ... Lake Tanganyika Pangani Lake Rukwa Internal Wami / Ruvu Pangani Rufiji Rufiji Rufiji Wami / Ruvu
subvillage Mnyusi B Nyamara Majengo Mahakamani Kyanyamisa Moa/Mwereme Ishinabulandi Nyawishi Center Imalauduki Mkonomre ... Kitonto Maore Kati Mbuyuni A Masanga Komstari Kiduruni Igumbilo Madungulu Mwinyi Kikatanyemba
region Iringa Mara Manyara Mtwara Kagera Tanga Shinyanga Shinyanga Tabora Kagera ... Rukwa Kilimanjaro Mbeya Tabora Morogoro Kilimanjaro Iringa Mbeya Dodoma Morogoro
region_code 11 20 21 90 18 4 17 17 14 18 ... 15 3 12 14 5 3 11 12 1 5
district_code 5 2 4 63 1 8 3 3 6 1 ... 2 3 1 2 6 5 4 7 4 2
lga Ludewa Serengeti Simanjiro Nanyumbu Karagwe Mkinga Shinyanga Rural Kahama Tabora Urban Karagwe ... Sumbawanga Rural Same Chunya Igunga Mvomero Hai Njombe Mbarali Chamwino Morogoro Rural
ward Mundindi Natta Ngorika Nanyumbu Nyakasimbi Moa Samuye Chambo Itetemia Kaisho ... Mkowe Maore Mbuyuni Igunga Diongoya Masama Magharibi Ikondo Chimala Mvumi Makulu Ngerengere
population 109 280 250 58 0 1 0 0 0 0 ... 150 210 0 0 89 125 56 0 0 150
public_meeting True NaN True True True True True True True True ... True True True False True True True True True True
recorded_by GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd ... GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd GeoData Consultants Ltd
scheme_management VWC Other VWC VWC NaN VWC VWC NaN VWC NaN ... VWC Water authority VWC Water authority VWC Water Board VWC VWC VWC VWC
scheme_name Roman NaN Nyumba ya mungu pipe scheme NaN NaN Zingibali NaN NaN NaN NaN ... NaN Hingilili NaN NaN NaN Losaa Kia water supply Ikondo electrical water sch NaN NaN NaN
permit False True True True True True True True True True ... False True True False True True True False True True
construction_year 1999 2010 2009 1986 0 2009 0 0 0 0 ... 1991 1967 0 0 2007 1999 1996 0 0 2002
extraction_type gravity gravity gravity submersible gravity submersible swn 80 nira/tanira india mark ii nira/tanira ... swn 80 gravity gravity gravity submersible gravity gravity swn 80 nira/tanira nira/tanira
extraction_type_group gravity gravity gravity submersible gravity submersible swn 80 nira/tanira india mark ii nira/tanira ... swn 80 gravity gravity gravity submersible gravity gravity swn 80 nira/tanira nira/tanira
extraction_type_class gravity gravity gravity submersible gravity submersible handpump handpump handpump handpump ... handpump gravity gravity gravity submersible gravity gravity handpump handpump handpump
management vwc wug vwc vwc other vwc vwc wug vwc vwc ... vwc vwc vwc private operator vwc water board vwc vwc vwc vwc
management_group user-group user-group user-group user-group other user-group user-group user-group user-group user-group ... user-group user-group user-group commercial user-group user-group user-group user-group user-group user-group
payment pay annually never pay pay per bucket never pay never pay pay per bucket never pay unknown never pay never pay ... never pay never pay never pay pay per bucket pay monthly pay per bucket pay annually pay monthly never pay pay when scheme fails
payment_type annually never pay per bucket never pay never pay per bucket never pay unknown never pay never pay ... never pay never pay never pay per bucket monthly per bucket annually monthly never pay on failure
water_quality soft soft soft soft soft salty soft milky salty soft ... soft soft soft soft soft soft soft fluoride soft salty
quality_group good good good good good salty good milky salty good ... good good good good good good good fluoride good salty
quantity enough insufficient enough dry seasonal enough enough enough seasonal enough ... insufficient enough enough insufficient enough enough enough enough insufficient enough
quantity_group enough insufficient enough dry seasonal enough enough enough seasonal enough ... insufficient enough enough insufficient enough enough enough enough insufficient enough
source spring rainwater harvesting dam machine dbh rainwater harvesting other machine dbh shallow well machine dbh shallow well ... machine dbh river spring dam machine dbh spring river machine dbh shallow well shallow well
source_type spring rainwater harvesting dam borehole rainwater harvesting other borehole shallow well borehole shallow well ... borehole river/lake spring dam borehole spring river/lake borehole shallow well shallow well
source_class groundwater surface surface groundwater surface unknown groundwater groundwater groundwater groundwater ... groundwater surface groundwater surface groundwater groundwater surface groundwater groundwater groundwater
waterpoint_type communal standpipe communal standpipe communal standpipe multiple communal standpipe multiple communal standpipe communal standpipe multiple hand pump hand pump hand pump hand pump ... hand pump communal standpipe communal standpipe other communal standpipe communal standpipe communal standpipe hand pump hand pump hand pump
waterpoint_type_group communal standpipe communal standpipe communal standpipe communal standpipe communal standpipe communal standpipe hand pump hand pump hand pump hand pump ... hand pump communal standpipe communal standpipe other communal standpipe communal standpipe communal standpipe hand pump hand pump hand pump
status_group functional functional functional non functional functional functional non functional non functional non functional functional ... functional non functional non functional functional non functional functional functional functional functional functional
day 14 6 25 28 13 13 1 9 3 3 ... 4 3 15 27 9 3 7 11 8 23
month 3 3 2 1 7 3 10 10 11 8 ... 8 8 4 10 3 5 5 4 3 3
year 2011 2013 2013 2013 2011 2011 2012 2012 2012 2011 ... 2011 2013 2011 2012 2011 2013 2011 2011 2011 2011

43 rows × 59400 columns

Pre procesado y limpieza de los datos¶

In [13]:
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   funder                 55763 non-null  object 
 3   gps_height             59400 non-null  int64  
 4   installer              55745 non-null  object 
 5   longitude              59400 non-null  float64
 6   latitude               59400 non-null  float64
 7   wpt_name               59398 non-null  object 
 8   num_private            59400 non-null  int64  
 9   basin                  59400 non-null  object 
 10  subvillage             59029 non-null  object 
 11  region                 59400 non-null  object 
 12  region_code            59400 non-null  int64  
 13  district_code          59400 non-null  int64  
 14  lga                    59400 non-null  object 
 15  ward                   59400 non-null  object 
 16  population             59400 non-null  int64  
 17  public_meeting         56066 non-null  object 
 18  recorded_by            59400 non-null  object 
 19  scheme_management      55522 non-null  object 
 20  scheme_name            30590 non-null  object 
 21  permit                 56344 non-null  object 
 22  construction_year      59400 non-null  int64  
 23  extraction_type        59400 non-null  object 
 24  extraction_type_group  59400 non-null  object 
 25  extraction_type_class  59400 non-null  object 
 26  management             59400 non-null  object 
 27  management_group       59400 non-null  object 
 28  payment                59400 non-null  object 
 29  payment_type           59400 non-null  object 
 30  water_quality          59400 non-null  object 
 31  quality_group          59400 non-null  object 
 32  quantity               59400 non-null  object 
 33  quantity_group         59400 non-null  object 
 34  source                 59400 non-null  object 
 35  source_type            59400 non-null  object 
 36  source_class           59400 non-null  object 
 37  waterpoint_type        59400 non-null  object 
 38  waterpoint_type_group  59400 non-null  object 
 39  status_group           59400 non-null  object 
 40  day                    59400 non-null  int32  
 41  month                  59400 non-null  int32  
 42  year                   59400 non-null  int32  
dtypes: float64(3), int32(3), int64(7), object(30)
memory usage: 18.8+ MB
In [14]:
#Valores null
train.isnull().sum()
Out[14]:
id                           0
amount_tsh                   0
funder                    3637
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     2
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3878
scheme_name              28810
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
day                          0
month                        0
year                         0
dtype: int64
In [15]:
#División de las variables por numérica o categórica
cat_cols= train.select_dtypes(include=['object','category']).columns
num_cols = train.select_dtypes(exclude=['object','category']).columns
In [16]:
#Vizualización de vairables categóricas
print(train[cat_cols].describe().transpose())
                       count unique                      top   freq
funder                 55763   1896   Government Of Tanzania   9084
installer              55745   2145                      DWE  17402
wpt_name               59398  37399                     none   3563
basin                  59400      9            Lake Victoria  10248
subvillage             59029  19287                 Madukani    508
region                 59400     21                   Iringa   5294
lga                    59400    125                   Njombe   2503
ward                   59400   2092                    Igosi    307
public_meeting         56066      2                     True  51011
recorded_by            59400      1  GeoData Consultants Ltd  59400
scheme_management      55522     11                      VWC  36793
scheme_name            30590   2695                        K    682
permit                 56344      2                     True  38852
extraction_type        59400     18                  gravity  26780
extraction_type_group  59400     13                  gravity  26780
extraction_type_class  59400      7                  gravity  26780
management             59400     12                      vwc  40507
management_group       59400      5               user-group  52490
payment                59400      7                never pay  25348
payment_type           59400      7                never pay  25348
water_quality          59400      8                     soft  50818
quality_group          59400      6                     good  50818
quantity               59400      5                   enough  33186
quantity_group         59400      5                   enough  33186
source                 59400     10                   spring  17021
source_type            59400      7                   spring  17021
source_class           59400      3              groundwater  45794
waterpoint_type        59400      7       communal standpipe  28522
waterpoint_type_group  59400      6       communal standpipe  34625
status_group           59400      3               functional  32259
In [17]:
#Vizualización de vairables numéricas
train[num_cols].describe()
Out[17]:
id amount_tsh gps_height longitude latitude num_private region_code district_code population construction_year day month year
count 59400.000000 59400.000000 59400.000000 59400.000000 5.940000e+04 59400.000000 59400.000000 59400.000000 59400.000000 59400.000000 59400.000000 59400.000000 59400.000000
mean 37115.131768 317.650385 668.297239 34.077427 -5.706033e+00 0.474141 15.297003 5.629747 179.909983 1300.652475 15.621498 4.375640 2011.921667
std 21453.128371 2997.574558 693.116350 6.567432 2.946019e+00 12.236230 17.587406 9.633649 471.482176 951.620547 8.687553 3.029247 0.958758
min 0.000000 0.000000 -90.000000 0.000000 -1.164944e+01 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 2002.000000
25% 18519.750000 0.000000 0.000000 33.090347 -8.540621e+00 0.000000 5.000000 2.000000 0.000000 0.000000 8.000000 2.000000 2011.000000
50% 37061.500000 0.000000 369.000000 34.908743 -5.021597e+00 0.000000 12.000000 3.000000 25.000000 1986.000000 16.000000 3.000000 2012.000000
75% 55656.500000 20.000000 1319.250000 37.178387 -3.326156e+00 0.000000 17.000000 5.000000 215.000000 2004.000000 23.000000 7.000000 2013.000000
max 74247.000000 350000.000000 2770.000000 40.345193 -2.000000e-08 1776.000000 99.000000 80.000000 30500.000000 2013.000000 31.000000 12.000000 2013.000000
In [18]:
#Vizualización de la distribución de los valores de la variable funder
funder_counts = train['funder'].value_counts()

print(funder_counts)
funder
Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Muwasa                       1
Msigw                        1
Rc Mofu                      1
Overland High School         1
Samlo                        1
Name: count, Length: 1896, dtype: int64
In [19]:
#Visualización gráfica
import pandas as pd
import matplotlib.pyplot as plt

# Obtener la distribución de la columna 'funder'
funder_counts = train['funder'].value_counts()

# Graficar los valores más comunes
top_funders = funder_counts.head(40)  # Muestra los 40 funders más comunes

# Crear la figura del gráfico
plt.figure(figsize=(10, 8))
top_funders.plot(kind='bar')
plt.title('Top 40 Funders')
plt.xlabel('Funder')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [20]:
# Obtener la distribución de la columna 'funder'
funder_counts = train['funder'].value_counts()

# Filtrar los funders con frecuencia inferior a 30
funders_below_30 = funder_counts[funder_counts < 30]

# Mostrar los funders y sus frecuencias
print(funders_below_30)
funder
World Vision/adra       29
Il                      29
Finwater                29
Cocern                  29
Kidep                   29
                        ..
Muwasa                   1
Msigw                    1
Rc Mofu                  1
Overland High School     1
Samlo                    1
Name: count, Length: 1690, dtype: int64
In [21]:
# Obtener la distribución de la columna 'funder'
funder_counts = train['funder'].value_counts()

# Filtrar los funders con frecuencia inferior a 20
funders_below_20 = funder_counts[funder_counts < 20]

# Mostrar los funders y sus frecuencias
print(funders_below_20)
funder
P                           19
Peace Cope                  19
Kuamu                       19
Makonde Water Population    19
Water Board                 19
                            ..
Muwasa                       1
Msigw                        1
Rc Mofu                      1
Overland High School         1
Samlo                        1
Name: count, Length: 1627, dtype: int64
In [22]:
# Obtener la distribución de la columna 'funder'
funder_counts = train['funder'].value_counts()

# Filtrar los funders con frecuencia inferior a 10
funders_below_10 = funder_counts[funder_counts < 10]

# Mostrar los funders y sus frecuencias
print(funders_below_10)
funder
W.D & I.                9
Council                 9
Cobashec                9
Aimgold                 9
Tassaf Ii               9
                       ..
Muwasa                  1
Msigw                   1
Rc Mofu                 1
Overland High School    1
Samlo                   1
Name: count, Length: 1490, dtype: int64

Se observa una gran cantidad de categorias con valores muy pequeños.

In [24]:
#Valores null en funder
total_nulls = train['funder'].isnull().sum()

print(total_nulls)
3637
In [25]:
import pandas as pd
import numpy as np


# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['funder'].dropna().count()
proportions = train['funder'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['funder'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for funder, count in imputed_nulls.items():
    impute_values.extend([funder] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'funder'
null_indices = train['funder'][train['funder'].isnull()].index
train.loc[null_indices, 'funder'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['funder'].value_counts())
Total de valores nulos: 3637
Proporciones de cada categoría:
funder
Government Of Tanzania    0.162904
Danida                    0.055843
Hesawa                    0.039489
Rwssp                     0.024640
World Bank                0.024192
                            ...   
Muwasa                    0.000018
Msigw                     0.000018
Rc Mofu                   0.000018
Overland High School      0.000018
Samlo                     0.000018
Name: count, Length: 1896, dtype: float64
Cantidad imputada a cada categoría:
funder
Government Of Tanzania    716
Danida                    203
Hesawa                    144
Rwssp                      90
World Bank                 88
                         ... 
Muwasa                      0
Msigw                       0
Rc Mofu                     0
Overland High School        0
Samlo                       0
Name: count, Length: 1896, dtype: int32
Valores imputados:
funder
Government Of Tanzania    9800
Danida                    3317
Hesawa                    2346
Rwssp                     1464
World Bank                1437
                          ... 
Muwasa                       1
Msigw                        1
Rc Mofu                      1
Overland High School         1
Samlo                        1
Name: count, Length: 1896, dtype: int64
In [26]:
#Valores null tras su ajuste
train.isnull().sum()
Out[26]:
id                           0
amount_tsh                   0
funder                       0
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     2
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3878
scheme_name              28810
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
day                          0
month                        0
year                         0
dtype: int64

Para abordar la variable funder, el objetivo principal fue manejar los valores nulos sin perder un gran número de datos. Eliminar estas observaciones hubiera resultado en una pérdida significativa de información, lo cual no era deseable.

En lugar de eliminar las filas con valores nulos o imputar todos los valores nulos con la categoría más frecuente, lo que podría sesgar significativamente los datos, se optó por una estrategia más equilibrada:

Calcular las proporciones de cada categoría: Primero, se calcularon las proporciones de cada categoría en la variable funder en función del total de observaciones no nulas. Esto permitió entender la distribución actual de las categorías.

Determinar la cantidad de valores nulos: Luego, se determinó la cantidad total de valores nulos presentes en la variable funder.

Imputar valores nulos basado en las proporciones calculadas: Se calculó cuántos valores nulos deberían asignarse a cada categoría en función de sus proporciones. Esto se hizo multiplicando las proporciones por el número total de valores nulos y redondeando los resultados.

Ajuste de la distribución: Para asegurarse de que el total de valores imputados coincidiera exactamente con el número de valores nulos, se realizaron ajustes menores. Si había una discrepancia debido al redondeo, se ajustó el valor de la categoría más frecuente para compensar la diferencia.

Durante el análisis de la variable categórica funder, se identificaron las siguientes características y preocupaciones:

Gran número de categorías: La variable funder contiene un total de 1896 categorías, lo cual es un número considerablemente alto. Un gran número de categorías puede complicar el análisis y la interpretación de los resultados, además de incrementar la dimensionalidad del problema.

Distribución desigual de frecuencias: Al revisar las frecuencias de las diferentes categorías, se observa una gran disparidad. Por ejemplo, la categoría más común, "Government Of Tanzania", tiene 9084 observaciones, mientras que muchas otras categorías tienen frecuencias significativamente menores. Categorías con frecuencia muy baja: Se identificó que una gran cantidad de categorías tienen frecuencias muy bajas. Específicamente:

1690 categorías tienen una frecuencia menor a 30. 1614 categorías tienen una frecuencia menor a 20. 1464 categorías tienen una frecuencia menor a 10. Por lo que se propone una agrupación de baja frecuencia con el siguiente fin. Reducción de la Dimensionalidad, mejora de la Estabilidad del Modelo y significado Estadístico, ya que, las categorías con muy pocas observaciones no tienen un impacto significativo cuando se consideran individualmente. Sin embargo, al agruparlas, su impacto colectivo puede ser más significativo y útil para el análisis.

Este proceso será replicado con el resto de las variables que contienen valores nulos o faltantes. Cabe mencionar que en la variable scheme_name, algunas categorías consistían en nombres de una sola letra, como "K", y estas tenían una representación considerablemente alta en la frecuencia de dicha variable. Esto me llevó a cuestionar si este tipo de categorías de una sola letra eran correctas o si deberían ser redistribuidas de acuerdo con la frecuencia de otras categorías.

In [30]:
# Calcular las frecuencias de las categorías despues del ajuste
funder_counts = train['funder'].value_counts()

# Definir los tramos de frecuencia
rare_1_9 = funder_counts[funder_counts < 10].index
rare_10_49 = funder_counts[(funder_counts >= 10) & (funder_counts < 50)].index
rare_50_99 = funder_counts[(funder_counts >= 50) & (funder_counts < 100)].index

# Reemplazar las categorías según los tramos definidos
train['funder'] = train['funder'].apply(
    lambda x: 'Rare_1_9' if x in rare_1_9 else 
              ('Rare_10_49' if x in rare_10_49 else 
               ('Rare_50_99' if x in rare_50_99 else x))
)

# Mostrar los resultados
print(train['funder'].value_counts())
funder
Government Of Tanzania           9800
Rare_10_49                       6245
Rare_50_99                       3917
Danida                           3317
Rare_1_9                         2793
                                 ... 
Village Council                   112
Halmashauri Ya Wilaya Sikonge     109
Germany                           108
Hsw                               108
Twe                               103
Name: count, Length: 95, dtype: int64
In [31]:
import pandas as pd
import matplotlib.pyplot as plt

# Obtener la distribución de la columna 'funder'
funder_counts = train['funder'].value_counts()

# Graficar los valores más comunes
top_funders = funder_counts.head(40)  # Muestra los 40 funders más comunes

# Crear la figura del gráfico
plt.figure(figsize=(10, 8))
top_funders.plot(kind='bar')
plt.title('Top 40 Funders')
plt.xlabel('Funder')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [32]:
installer_counts = train['installer'].value_counts()

print(installer_counts)
installer
DWE                17402
Government          1825
RWE                 1206
Commu               1060
DANIDA              1050
                   ...  
Wizara  ya maji        1
TWESS                  1
Nasan workers          1
R                      1
SELEPTA                1
Name: count, Length: 2145, dtype: int64
In [33]:
import pandas as pd
import matplotlib.pyplot as plt

# Obtener la distribución de la columna 'installer'
installer_counts = train['installer'].value_counts()

# Graficar los valores más comunes
top_installers = installer_counts.head(40)  # Muestra los 40 installers más comunes

# Crear la figura del gráfico
plt.figure(figsize=(10, 8))
top_installers.plot(kind='bar')
plt.title('Top 40 Installers')
plt.xlabel('Installer')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [34]:
# Obtener la distribución de la columna 'installer'
installer_counts = train['installer'].value_counts()

# Filtrar los installers con frecuencia inferior a 30
installers_below_30 = installer_counts[installer_counts < 30]

# Mostrar los installers y sus frecuencias
print(installers_below_30)
installer
CCEC               29
ABASIA             29
Po                 29
PADEP              29
FinWater           29
                   ..
Wizara  ya maji     1
TWESS               1
Nasan workers       1
R                   1
SELEPTA             1
Name: count, Length: 1950, dtype: int64
In [35]:
# Obtener la distribución de la columna 'installer'
installer_counts = train['installer'].value_counts()

# Filtrar los installers con frecuencia inferior a 20
installers_below_20 = installer_counts[installer_counts < 20]

# Mostrar los installers y sus frecuencias
print(installers_below_20)
installer
Kuamu                   19
KIM KIM CONSTRUCTION    19
CG                      19
private                 19
SOWASA                  19
                        ..
Wizara  ya maji          1
TWESS                    1
Nasan workers            1
R                        1
SELEPTA                  1
Name: count, Length: 1876, dtype: int64
In [36]:
# Obtener la distribución de la columna 'installer'
installer_counts = train['installer'].value_counts()

# Filtrar los installers con frecuencia inferior a 10
installers_below_10 = installer_counts[installer_counts < 10]

# Mostrar los installers y sus frecuencias
print(installers_below_10)
installer
VILLAGERS          9
Healt              9
RUDEP              9
ACTIVE TANK CO     9
IDARA              9
                  ..
Wizara  ya maji    1
TWESS              1
Nasan workers      1
R                  1
SELEPTA            1
Name: count, Length: 1722, dtype: int64
In [37]:
# Obtener la distribución de la columna 'installer'
installer_counts = train['installer'].value_counts()

# Filtrar los installers con frecuencia inferior a 5
installers_below_5 = installer_counts[installer_counts < 5]

# Mostrar los installers y sus frecuencias
print(installers_below_5)
installer
Roman Ca           4
TGTS               4
Concen             4
TUMAINI FUND       4
JLH CO LTD         4
                  ..
Wizara  ya maji    1
TWESS              1
Nasan workers      1
R                  1
SELEPTA            1
Name: count, Length: 1526, dtype: int64
In [38]:
total_nulls_installer = train['installer'].isnull().sum()

print(total_nulls_installer)
3655
In [39]:
# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['installer'].dropna().count()
proportions = train['installer'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['installer'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for installer, count in imputed_nulls.items():
    impute_values.extend([installer] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'installer'
null_indices = train['installer'][train['installer'].isnull()].index
train.loc[null_indices, 'installer'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['installer'].value_counts())
Total de valores nulos: 3655
Proporciones de cada categoría:
installer
DWE                0.312171
Government         0.032738
RWE                0.021634
Commu              0.019015
DANIDA             0.018836
                     ...   
Wizara  ya maji    0.000018
TWESS              0.000018
Nasan workers      0.000018
R                  0.000018
SELEPTA            0.000018
Name: count, Length: 2145, dtype: float64
Cantidad imputada a cada categoría:
installer
DWE                1292
Government          120
RWE                  79
Commu                70
DANIDA               69
                   ... 
Wizara  ya maji       0
TWESS                 0
Nasan workers         0
R                     0
SELEPTA               0
Name: count, Length: 2145, dtype: int32
Valores imputados:
installer
DWE                18694
Government          1945
RWE                 1285
Commu               1130
DANIDA              1119
                   ...  
Wizara  ya maji        1
TWESS                  1
Nasan workers          1
R                      1
SELEPTA                1
Name: count, Length: 2145, dtype: int64
In [40]:
train.isnull().sum()
Out[40]:
id                           0
amount_tsh                   0
funder                       0
gps_height                   0
installer                    0
longitude                    0
latitude                     0
wpt_name                     2
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3878
scheme_name              28810
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
day                          0
month                        0
year                         0
dtype: int64
In [41]:
# Calcular las frecuencias de las categorías
installer_counts = train['installer'].value_counts()

# Definir los tramos de frecuencia
rare_menor_5 = installer_counts[installer_counts < 5].index
rare_5_20 = installer_counts[(installer_counts >= 5) & (installer_counts <= 20)].index
rare_menor_50_mayor_20 = installer_counts[(installer_counts > 20) & (installer_counts < 50)].index
rare_50_100 = installer_counts[(installer_counts >= 50) & (installer_counts <= 100)].index
rare_101_150 = installer_counts[(installer_counts > 100) & (installer_counts <= 150)].index
rare_151_200 = installer_counts[(installer_counts > 150) & (installer_counts <= 200)].index
rare_201_500 = installer_counts[(installer_counts > 200) & (installer_counts <= 500)].index

# Reemplazar las categorías según los tramos definidos
train['installer'] = train['installer'].apply(
    lambda x: 'Rare_menor_5' if x in rare_menor_5 else 
              ('Rare_5_20' if x in rare_5_20 else 
               ('Rare_menor_50_mayor_20' if x in rare_menor_50_mayor_20 else 
                ('Rare_50_100' if x in rare_50_100 else 
                 ('Rare_101_150' if x in rare_101_150 else 
                  ('Rare_151_200' if x in rare_151_200 else 
                   ('Rare_201_500' if x in rare_201_500 else x))))))
)

# Mostrar los resultados
print(train['installer'].value_counts())
installer
DWE                       18694
Rare_201_500               7694
Rare_50_100                4107
Rare_101_150               3892
Rare_menor_50_mayor_20     3732
Rare_5_20                  3568
Rare_151_200               2895
Rare_menor_5               2255
Government                 1945
RWE                        1285
Commu                      1130
DANIDA                     1119
KKKT                        957
Hesawa                      895
0                           828
TCRS                        753
Central government          663
CES                         650
Community                   589
DANID                       588
District Council            587
HESAWA                      574
Name: count, dtype: int64
In [42]:
installer_counts_subvillage = train['subvillage'].value_counts()

print(installer_counts_subvillage)
subvillage
Madukani        508
Shuleni         506
Majengo         502
Kati            373
Mtakuja         262
               ... 
Kipompo           1
Chanyamilima      1
Ikalime           1
Kemagaka          1
Kikatanyemba      1
Name: count, Length: 19287, dtype: int64
In [43]:
total_nulls_subvillage = train['subvillage'].isnull().sum()

print(total_nulls_subvillage)
371
In [44]:
import pandas as pd
import matplotlib.pyplot as plt

# Obtener la distribución de la columna 'subvillage'
subvillage_counts = train['subvillage'].value_counts()

# Graficar los valores más comunes
top_subvillages = subvillage_counts.head(40)  # Muestra los 40 subvillages más comunes

# Crear la figura del gráfico
plt.figure(figsize=(10, 8))
top_subvillages.plot(kind='bar')
plt.title('Top 40 Subvillages')
plt.xlabel('Subvillage')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [45]:
# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['subvillage'].dropna().count()
proportions = train['subvillage'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['subvillage'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for subvillage, count in imputed_nulls.items():
    impute_values.extend([subvillage] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'subvillage'
null_indices = train['subvillage'][train['subvillage'].isnull()].index
train.loc[null_indices, 'subvillage'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['subvillage'].value_counts())
Total de valores nulos: 371
Proporciones de cada categoría:
subvillage
Madukani        0.008606
Shuleni         0.008572
Majengo         0.008504
Kati            0.006319
Mtakuja         0.004438
                  ...   
Kipompo         0.000017
Chanyamilima    0.000017
Ikalime         0.000017
Kemagaka        0.000017
Kikatanyemba    0.000017
Name: count, Length: 19287, dtype: float64
Cantidad imputada a cada categoría:
subvillage
Madukani        331
Shuleni           3
Majengo           3
Kati              2
Mtakuja           2
               ... 
Kipompo           0
Chanyamilima      0
Ikalime           0
Kemagaka          0
Kikatanyemba      0
Name: count, Length: 19287, dtype: int32
Valores imputados:
subvillage
Madukani        839
Shuleni         509
Majengo         505
Kati            375
Mtakuja         264
               ... 
Kipompo           1
Chanyamilima      1
Ikalime           1
Kemagaka          1
Kikatanyemba      1
Name: count, Length: 19287, dtype: int64
In [46]:
# Obtener la distribución de la columna 'installer'
subvillage_counts = train['subvillage'].value_counts()

# Filtrar los installers con frecuencia inferior a 30
subvillage_below_30 = subvillage_counts[subvillage_counts < 30]

# Mostrar los installers y sus frecuencias
print(subvillage_below_30)
subvillage
Kivukoni        29
Stesheni        29
Mandela         29
Lusungo         29
Uhuru           28
                ..
Kipompo          1
Chanyamilima     1
Ikalime          1
Kemagaka         1
Kikatanyemba     1
Name: count, Length: 19162, dtype: int64
In [47]:
# Obtener la distribución de la columna 'installer'
subvillage_counts = train['subvillage'].value_counts()

# Filtrar los installers con frecuencia inferior a 10
subvillage_below_10 = subvillage_counts[subvillage_counts < 10]

# Mostrar los installers y sus frecuencias
print(subvillage_below_10)
subvillage
Kinyika         9
Nselembwe       9
Isupilo         9
Kinjumbi A      9
Lusungu         9
               ..
Kipompo         1
Chanyamilima    1
Ikalime         1
Kemagaka        1
Kikatanyemba    1
Name: count, Length: 18560, dtype: int64
In [48]:
# Obtener la distribución de la columna 'installer'
subvillage_counts = train['subvillage'].value_counts()

# Filtrar los installers con frecuencia inferior a 5
subvillage_below_5 = subvillage_counts[subvillage_counts < 5]

# Mostrar los installers y sus frecuencias
print(subvillage_below_5)
subvillage
Mlati           4
Zizini          4
Nyarukaza       4
Leshaku         4
Ngongowele      4
               ..
Kipompo         1
Chanyamilima    1
Ikalime         1
Kemagaka        1
Kikatanyemba    1
Name: count, Length: 16569, dtype: int64
In [49]:
# Obtener la distribución de la columna 'installer'
subvillage_counts = train['subvillage'].value_counts()

# Filtrar los installers con frecuencia igual a 1
subvillage_below_1 = subvillage_counts[subvillage_counts == 1]

# Mostrar los installers y sus frecuencias
print(subvillage_below_1)
subvillage
Kahalawe        1
Hevizumbo       1
Hudighadi       1
Sekond          1
Mininga         1
               ..
Kipompo         1
Chanyamilima    1
Ikalime         1
Kemagaka        1
Kikatanyemba    1
Name: count, Length: 9424, dtype: int64
In [50]:
# Definir los tramos de frecuencia
rare_1 = subvillage_counts[subvillage_counts == 1].index
rare_2_4 = subvillage_counts[(subvillage_counts > 1) & (subvillage_counts < 5)].index
rare_5_10 = subvillage_counts[(subvillage_counts >= 5) & (subvillage_counts <= 10)].index
rare_11_20 = subvillage_counts[(subvillage_counts >= 11) & (subvillage_counts <= 20)].index
rare_21_50 = subvillage_counts[(subvillage_counts >= 21) & (subvillage_counts <= 50)].index
rare_51_100 = subvillage_counts[(subvillage_counts >= 51) & (subvillage_counts <= 100)].index
rare_101_plus = subvillage_counts[subvillage_counts > 100].index

# Reemplazar las categorías según los tramos definidos
train['subvillage'] = train['subvillage'].apply(
    lambda x: 'Rare_1' if x in rare_1 else 
              ('Rare_2_4' if x in rare_2_4 else 
               ('Rare_5_10' if x in rare_5_10 else 
                ('Rare_11_20' if x in rare_11_20 else 
                 ('Rare_21_50' if x in rare_21_50 else 
                  ('Rare_51_100' if x in rare_51_100 else 
                   ('Rare_101_plus' if x in rare_101_plus else x))))))
)

# Mostrar los resultados
print(train['subvillage'].value_counts())
subvillage
Rare_2_4         18809
Rare_5_10        13758
Rare_1            9424
Rare_11_20        5533
Rare_101_plus     4828
Rare_21_50        4345
Rare_51_100       2703
Name: count, dtype: int64
In [51]:
train.isnull().sum()
Out[51]:
id                           0
amount_tsh                   0
funder                       0
gps_height                   0
installer                    0
longitude                    0
latitude                     0
wpt_name                     2
num_private                  0
basin                        0
subvillage                   0
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3878
scheme_name              28810
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
day                          0
month                        0
year                         0
dtype: int64
In [52]:
wpt_name_counts = train['wpt_name'].value_counts()

print(wpt_name_counts)
wpt_name
none                       3563
Shuleni                    1748
Zahanati                    830
Msikitini                   535
Kanisani                    323
                           ... 
Kwa Medadi                    1
Kwa Kubembeni                 1
Shule Ya Msingi Milanzi       1
Funua                         1
Kwa Mzee Lugawa               1
Name: count, Length: 37399, dtype: int64
In [53]:
import pandas as pd

# Filtrar las categorías con nombres de longitud menor o igual a 2
short_names = wpt_name_counts[wpt_name_counts.index.str.len() < 2]

# Mostrar el resultado
print(short_names)
print(f'Número de categorías con nombres de longitud <= 2: {len(short_names)}')
wpt_name
M    3
X    1
K    1
R    1
B    1
S    1
G    1
Name: count, dtype: int64
Número de categorías con nombres de longitud <= 2: 7
In [54]:
import matplotlib.pyplot as plt

# Obtener la distribución de la columna 'wpt_name'
wpt_name_counts = train['wpt_name'].value_counts()

# Graficar los valores más comunes
top_wpt_names = wpt_name_counts.head(40)  # Muestra los 40 nombres de puntos de agua más comunes

# Crear la figura del gráfico
plt.figure(figsize=(10, 8))
top_wpt_names.plot(kind='bar')
plt.title('Top 40 Water Point Names')
plt.xlabel('Water Point Name')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [55]:
# Paso 1: Convertir "none" a NaN
train['wpt_name'] = train['wpt_name'].replace('none', np.nan)

# Paso 2: Contar valores nulos
total_nulls = train['wpt_name'].isnull().sum()
print(total_nulls)
3565
In [56]:
# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['wpt_name'].dropna().count()
proportions = train['wpt_name'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['wpt_name'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for wpt_name, count in imputed_nulls.items():
    impute_values.extend([wpt_name] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'wpt_name'
null_indices = train['wpt_name'][train['wpt_name'].isnull()].index
train.loc[null_indices, 'wpt_name'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['wpt_name'].value_counts())
Total de valores nulos: 3565
Proporciones de cada categoría:
wpt_name
Shuleni                    0.031307
Zahanati                   0.014865
Msikitini                  0.009582
Kanisani                   0.005785
Bombani                    0.004854
                             ...   
Kwa Medadi                 0.000018
Kwa Kubembeni              0.000018
Shule Ya Msingi Milanzi    0.000018
Funua                      0.000018
Kwa Mzee Lugawa            0.000018
Name: count, Length: 37398, dtype: float64
Cantidad imputada a cada categoría:
wpt_name
Shuleni                    2851
Zahanati                     53
Msikitini                    34
Kanisani                     21
Bombani                      17
                           ... 
Kwa Medadi                    0
Kwa Kubembeni                 0
Shule Ya Msingi Milanzi       0
Funua                         0
Kwa Mzee Lugawa               0
Name: count, Length: 37398, dtype: int32
Valores imputados:
wpt_name
Shuleni                    4599
Zahanati                    883
Msikitini                   569
Kanisani                    344
Bombani                     288
                           ... 
Kwa Medadi                    1
Kwa Kubembeni                 1
Shule Ya Msingi Milanzi       1
Funua                         1
Kwa Mzee Lugawa               1
Name: count, Length: 37398, dtype: int64
In [57]:
# Obtener la distribución de la columna 'wpt_name_'
wpt_name_counts = train['wpt_name'].value_counts()

# Filtrar los wpt_name_ con frecuencia inferior a 30
wpt_name_below_30 = wpt_name_counts[wpt_name_counts < 30]

# Mostrar los wpt_name_ y sus frecuencias
print(wpt_name_below_30)
wpt_name
Kwa Samweli                29
Lutheran Church            29
Machinjioni                28
Migombani                  28
Godown                     28
                           ..
Kwa Medadi                  1
Kwa Kubembeni               1
Shule Ya Msingi Milanzi     1
Funua                       1
Kwa Mzee Lugawa             1
Name: count, Length: 37324, dtype: int64
In [58]:
# Obtener la distribución de la columna 'wpt_name_'
wpt_name_counts = train['wpt_name'].value_counts()

# Filtrar los wpt_name_ con frecuencia igual a 1
wpt_name_below_1 = wpt_name_counts[wpt_name_counts == 1]

# Mostrar los wpt_name_ y sus frecuencias
print(wpt_name_below_1)
wpt_name
Binti Selemani             1
Mwambenene                 1
Kwa Stephano Mbao          1
Kwa Mzee Likalangi         1
Kashanga                   1
                          ..
Kwa Medadi                 1
Kwa Kubembeni              1
Shule Ya Msingi Milanzi    1
Funua                      1
Kwa Mzee Lugawa            1
Name: count, Length: 32928, dtype: int64
In [59]:
# Obtener la distribución de la columna 'wpt_name'
wpt_name_counts = train['wpt_name'].value_counts()

# Definir los tramos de frecuencia
rare_1 = wpt_name_counts[wpt_name_counts == 1].index
rare_2_30 = wpt_name_counts[(wpt_name_counts > 1) & (wpt_name_counts <= 30)].index
rare_31_100 = wpt_name_counts[(wpt_name_counts > 30) & (wpt_name_counts <= 100)].index
rare_101_500 = wpt_name_counts[(wpt_name_counts > 100) & (wpt_name_counts <= 500)].index

# Reemplazar las categorías según los tramos definidos
train['wpt_name'] = train['wpt_name'].apply(
    lambda x: 'Rare_1' if x in rare_1 else
              ('Rare_2_30' if x in rare_2_30 else
               ('Rare_31_100' if x in rare_31_100 else
                ('Rare_101_500' if x in rare_101_500 else x))
    )
)

# Mostrar los resultados
print(train['wpt_name'].value_counts())
wpt_name
Rare_1          32928
Rare_2_30       14968
Shuleni          4599
Rare_31_100      3154
Rare_101_500     2299
Zahanati          883
Msikitini         569
Name: count, dtype: int64
In [60]:
train.isnull().sum()
Out[60]:
id                           0
amount_tsh                   0
funder                       0
gps_height                   0
installer                    0
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                   0
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3878
scheme_name              28810
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
day                          0
month                        0
year                         0
dtype: int64
In [61]:
public_meeting_counts = train['public_meeting'].value_counts()

print(public_meeting_counts)
public_meeting
True     51011
False     5055
Name: count, dtype: int64
In [62]:
# Obtener la distribución de la columna 'public_meeting'
public_meeting_counts = train['public_meeting'].value_counts()

# Crear una copia para evitar modificar los datos originales
public_meeting_counts_modified = public_meeting_counts.copy()

# Reemplazar True por 1 y False por 0 en la copia
public_meeting_counts_modified.index = public_meeting_counts_modified.index.map({True: 1, False: 0}.get)

# Mostrar los resultados
print(public_meeting_counts_modified)
public_meeting
1    51011
0     5055
Name: count, dtype: int64
In [63]:
# Contar valores nulos
total_nulls_public_meeting = train['public_meeting'].isnull().sum()
print(total_nulls_public_meeting)
3334
In [64]:
# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['public_meeting'].dropna().count()
proportions = train['public_meeting'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['public_meeting'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for public_meeting, count in imputed_nulls.items():
    impute_values.extend([public_meeting] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'public_meeting'
null_indices = train['public_meeting'][train['public_meeting'].isnull()].index
train.loc[null_indices, 'public_meeting'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['public_meeting'].value_counts())
Total de valores nulos: 3334
Proporciones de cada categoría:
public_meeting
True     0.909838
False    0.090162
Name: count, dtype: float64
Cantidad imputada a cada categoría:
public_meeting
True     3033
False     301
Name: count, dtype: int32
Valores imputados:
public_meeting
True     54044
False     5356
Name: count, dtype: int64
In [65]:
train.isnull().sum()
Out[65]:
id                           0
amount_tsh                   0
funder                       0
gps_height                   0
installer                    0
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                   0
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting               0
recorded_by                  0
scheme_management         3878
scheme_name              28810
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
day                          0
month                        0
year                         0
dtype: int64
In [66]:
scheme_management_counts = train['scheme_management'].value_counts()

print(scheme_management_counts)
scheme_management
VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
Name: count, dtype: int64
In [67]:
import numpy as np

# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['scheme_management'].dropna().count()
proportions = train['scheme_management'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['scheme_management'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for scheme_management, count in imputed_nulls.items():
    impute_values.extend([scheme_management] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'scheme_management'
null_indices = train['scheme_management'][train['scheme_management'].isnull()].index
train.loc[null_indices, 'scheme_management'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['scheme_management'].value_counts())
Total de valores nulos: 3878
Proporciones de cada categoría:
scheme_management
VWC                 0.662674
WUG                 0.093765
Water authority     0.056788
WUA                 0.051925
Water Board         0.049494
Parastatal          0.030258
Private operator    0.019146
Company             0.019110
Other               0.013796
SWC                 0.001747
Trust               0.001297
Name: count, dtype: float64
Cantidad imputada a cada categoría:
scheme_management
VWC                 2570
WUG                  364
Water authority      220
WUA                  201
Water Board          192
Parastatal           117
Private operator      74
Company               74
Other                 54
SWC                    7
Trust                  5
Name: count, dtype: int32
Valores imputados:
scheme_management
VWC                 39363
WUG                  5570
Water authority      3373
WUA                  3084
Water Board          2940
Parastatal           1797
Private operator     1137
Company              1135
Other                 820
SWC                   104
Trust                  77
Name: count, dtype: int64
In [68]:
train.isnull().sum()
Out[68]:
id                           0
amount_tsh                   0
funder                       0
gps_height                   0
installer                    0
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                   0
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting               0
recorded_by                  0
scheme_management            0
scheme_name              28810
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
day                          0
month                        0
year                         0
dtype: int64
In [69]:
scheme_name_counts = train['scheme_name'].value_counts()

print(scheme_name_counts)
scheme_name
K                       682
Borehole                546
Chalinze wate           405
M                       400
DANIDA                  379
                       ... 
Mradi wa maji Vijini      1
Villagers                 1
Magundi water supply      1
Saadani Chumv             1
Mtawanya                  1
Name: count, Length: 2695, dtype: int64
In [70]:
# Filtrar las categorías con nombres de longitud menor o igual a 2
short_names = scheme_name_counts[scheme_name_counts.index.str.len() < 2]

# Mostrar el resultado
print(short_names)
print(f'Número de categorías con nombres de longitud <= 2: {len(short_names)}')
scheme_name
K    682
M    400
I    229
N    204
S    154
D     78
G     71
J     59
U     58
A     42
W     39
B     37
T     17
L     16
P     12
E      7
O      6
k      3
C      2
V      2
v      1
Name: count, dtype: int64
Número de categorías con nombres de longitud <= 2: 21
In [71]:
# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['scheme_name'].dropna().count()
proportions = train['scheme_name'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['scheme_name'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for scheme_name, count in imputed_nulls.items():
    impute_values.extend([scheme_name] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'scheme_name'
null_indices = train['scheme_name'][train['scheme_name'].isnull()].index
train.loc[null_indices, 'scheme_name'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['scheme_name'].value_counts())
Total de valores nulos: 28810
Proporciones de cada categoría:
scheme_name
K                       0.022295
Borehole                0.017849
Chalinze wate           0.013240
M                       0.013076
DANIDA                  0.012390
                          ...   
Mradi wa maji Vijini    0.000033
Villagers               0.000033
Magundi water supply    0.000033
Saadani Chumv           0.000033
Mtawanya                0.000033
Name: count, Length: 2695, dtype: float64
Cantidad imputada a cada categoría:
scheme_name
K                       642
Borehole                514
Chalinze wate           381
M                       377
DANIDA                  357
                       ... 
Mradi wa maji Vijini      1
Villagers                 1
Magundi water supply      1
Saadani Chumv             1
Mtawanya                  1
Name: count, Length: 2695, dtype: int32
Valores imputados:
scheme_name
K                       1324
Borehole                1060
Chalinze wate            786
M                        777
DANIDA                   736
                        ... 
Mradi wa maji Vijini       1
Villagers                  1
Magundi water supply       1
Saadani Chumv              1
Mtawanya                   1
Name: count, Length: 2695, dtype: int64
In [72]:
# Obtener la distribución de la columna 'scheme_name'
scheme_name_counts = train['scheme_name'].value_counts()

# Filtrar los scheme_name con frecuencia igual a 1
scheme_name_below_1 = scheme_name_counts[scheme_name_counts < 3]

# Mostrar los scheme_name y sus frecuencias
print(scheme_name_below_1)
scheme_name
River                     2
TM part Two               2
Forforo water supply      2
Maga                      2
Mkabenga spring source    2
                         ..
Mradi wa maji Vijini      1
Villagers                 1
Magundi water supply      1
Saadani Chumv             1
Mtawanya                  1
Name: count, Length: 712, dtype: int64
In [73]:
# Obtener la distribución de la columna 'scheme_name'
scheme_name_counts = train['scheme_name'].value_counts()

# Filtrar los scheme_name con frecuencia igual a 1
scheme_name_below_1 = scheme_name_counts[scheme_name_counts < 7]

# Mostrar los scheme_name y sus frecuencias
print(scheme_name_below_1)
scheme_name
Mrufiti                        6
Mkutimango water supply        6
BL Lang'ata                    6
Mangu windmill piped scheme    6
Kampogonyo water               6
                              ..
Mradi wa maji Vijini           1
Villagers                      1
Magundi water supply           1
Saadani Chumv                  1
Mtawanya                       1
Name: count, Length: 1191, dtype: int64
In [74]:
# Obtener la distribución de la columna 'scheme_name'
scheme_name_counts = train['scheme_name'].value_counts()

# Filtrar los scheme_name con frecuencia igual a 1
scheme_name_below_1 = scheme_name_counts[scheme_name_counts < 20]

# Mostrar los scheme_name y sus frecuencias
print(scheme_name_below_1)
scheme_name
Mradi wa maji Nyanduga    19
Kiny                      19
Msirwa pipeline           19
Songota pipe line         19
Lupe                      19
                          ..
Mradi wa maji Vijini       1
Villagers                  1
Magundi water supply       1
Saadani Chumv              1
Mtawanya                   1
Name: count, Length: 1964, dtype: int64
In [75]:
# Obtener la distribución de la columna 'scheme_name'
scheme_name_counts = train['scheme_name'].value_counts()

# Definir los tramos de frecuencia
rare_less_3 = scheme_name_counts[scheme_name_counts < 3].index
rare_3_7 = scheme_name_counts[(scheme_name_counts >= 3) & (scheme_name_counts <= 7)].index
rare_8_12 = scheme_name_counts[(scheme_name_counts >= 8) & (scheme_name_counts <= 12)].index
rare_13_16 = scheme_name_counts[(scheme_name_counts >= 13) & (scheme_name_counts <= 16)].index
rare_17_20 = scheme_name_counts[(scheme_name_counts >= 17) & (scheme_name_counts <= 20)].index
rare_21_25 = scheme_name_counts[(scheme_name_counts >= 21) & (scheme_name_counts <= 25)].index
rare_26_30 = scheme_name_counts[(scheme_name_counts >= 26) & (scheme_name_counts <= 30)].index
rare_31_35 = scheme_name_counts[(scheme_name_counts >= 31) & (scheme_name_counts <= 35)].index
rare_36_40 = scheme_name_counts[(scheme_name_counts >= 36) & (scheme_name_counts <= 40)].index
rare_41_45 = scheme_name_counts[(scheme_name_counts >= 41) & (scheme_name_counts <= 45)].index
rare_46_50 = scheme_name_counts[(scheme_name_counts >= 46) & (scheme_name_counts <= 50)].index
rare_51_55 = scheme_name_counts[(scheme_name_counts >= 51) & (scheme_name_counts <= 55)].index
rare_56_64 = scheme_name_counts[(scheme_name_counts >= 56) & (scheme_name_counts <= 64)].index
rare_65_70 = scheme_name_counts[(scheme_name_counts >= 65) & (scheme_name_counts <= 70)].index
rare_71_80 = scheme_name_counts[(scheme_name_counts >= 71) & (scheme_name_counts <= 80)].index
rare_81_90 = scheme_name_counts[(scheme_name_counts >= 81) & (scheme_name_counts <= 90)].index
rare_91_100 = scheme_name_counts[(scheme_name_counts >= 91) & (scheme_name_counts <= 100)].index
rare_101_120 = scheme_name_counts[(scheme_name_counts >= 101) & (scheme_name_counts <= 120)].index
rare_121_140 = scheme_name_counts[(scheme_name_counts >= 121) & (scheme_name_counts <= 140)].index
rare_141_160 = scheme_name_counts[(scheme_name_counts >= 141) & (scheme_name_counts <= 160)].index
rare_161_180 = scheme_name_counts[(scheme_name_counts >= 161) & (scheme_name_counts <= 180)].index
rare_181_200 = scheme_name_counts[(scheme_name_counts >= 181) & (scheme_name_counts <= 200)].index

# Reemplazar las categorías según los tramos definidos
train['scheme_name'] = train['scheme_name'].apply(lambda x: 
    'Rare_less_3' if x in rare_less_3 else
    'Rare_3_7' if x in rare_3_7 else
    'Rare_8_12' if x in rare_8_12 else
    'Rare_13_16' if x in rare_13_16 else
    'Rare_17_20' if x in rare_17_20 else
    'Rare_21_25' if x in rare_21_25 else
    'Rare_26_30' if x in rare_26_30 else
    'Rare_31_35' if x in rare_31_35 else
    'Rare_36_40' if x in rare_36_40 else
    'Rare_41_45' if x in rare_41_45 else
    'Rare_46_50' if x in rare_46_50 else
    'Rare_51_55' if x in rare_51_55 else
    'Rare_56_64' if x in rare_56_64 else
    'Rare_65_70' if x in rare_65_70 else
    'Rare_71_80' if x in rare_71_80 else
    'Rare_81_90' if x in rare_81_90 else
    'Rare_91_100' if x in rare_91_100 else
    'Rare_101_120' if x in rare_101_120 else
    'Rare_121_140' if x in rare_121_140 else
    'Rare_141_160' if x in rare_141_160 else
    'Rare_161_180' if x in rare_161_180 else
    'Rare_181_200' if x in rare_181_200 else
    x
)

# Mostrar los resultados
print(train['scheme_name'].value_counts())
scheme_name
Rare_8_12                                4346
Rare_21_25                               3915
Rare_31_35                               3315
Rare_13_16                               2872
Rare_56_64                               2804
Rare_101_120                             2556
Rare_17_20                               2482
Rare_46_50                               2476
Rare_26_30                               2408
Rare_41_45                               2389
Rare_3_7                                 2268
Rare_81_90                               1959
Rare_91_100                              1807
Rare_71_80                               1676
Rare_36_40                               1596
Rare_161_180                             1366
Rare_65_70                               1360
Rare_141_160                             1334
K                                        1324
Rare_less_3                              1212
Rare_121_140                             1174
Borehole                                 1060
Chalinze wate                             786
M                                         777
Rare_51_55                                744
DANIDA                                    736
Government                                621
Ngana water supplied scheme               524
wanging'ombe water supply s               507
wanging'ombe supply scheme                454
Bagamoyo wate                             445
I                                         445
Uroki-Bomang'ombe water sup               406
N                                         396
Rare_181_200                              392
Kirua kahe gravity water supply trust     375
Machumba estate pipe line                 359
Makwale water supplied sche               322
Kijiji                                    313
S                                         299
Losaa-Kia water supply                    295
Handeni Trunk Main(H                      295
mtwango water supply scheme               295
Mkongoro Two                              285
Roman                                     270
Mkongoro One                              249
Maambreni gravity water supply            243
Lyamungo umbwe water supply               233
Kirua kahe pumping water trust            219
Nabaiye pipe line                         208
Ki                                        208
Name: count, dtype: int64
In [76]:
train.isnull().sum()
Out[76]:
id                          0
amount_tsh                  0
funder                      0
gps_height                  0
installer                   0
longitude                   0
latitude                    0
wpt_name                    0
num_private                 0
basin                       0
subvillage                  0
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting              0
recorded_by                 0
scheme_management           0
scheme_name                 0
permit                   3056
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment                     0
payment_type                0
water_quality               0
quality_group               0
quantity                    0
quantity_group              0
source                      0
source_type                 0
source_class                0
waterpoint_type             0
waterpoint_type_group       0
status_group                0
day                         0
month                       0
year                        0
dtype: int64
In [77]:
permit_counts = train['permit'].value_counts()

print(permit_counts)
permit
True     38852
False    17492
Name: count, dtype: int64
In [78]:
# Obtener la distribución de la columna 'public_meeting'
permit_counts = train['permit'].value_counts()

# Crear una copia para evitar modificar los datos originales
permit_counts_modified = permit_counts.copy()

# Reemplazar True por 1 y False por 0 en la copia
permit_counts_modified.index = permit_counts_modified.index.map({True: 1, False: 0}.get)

# Mostrar los resultados
print(permit_counts_modified)
permit
1    38852
0    17492
Name: count, dtype: int64
In [79]:
# Contar valores nulos
total_nulls_permit = train['permit'].isnull().sum()
print(total_nulls_permit)
3056
In [80]:
# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = train['permit'].dropna().count()
proportions = train['permit'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = train['permit'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for permit, count in imputed_nulls.items():
    impute_values.extend([permit] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'permit'
null_indices = train['permit'][train['permit'].isnull()].index
train.loc[null_indices, 'permit'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(train['permit'].value_counts())
Total de valores nulos: 3056
Proporciones de cada categoría:
permit
True     0.68955
False    0.31045
Name: count, dtype: float64
Cantidad imputada a cada categoría:
permit
True     2107
False     949
Name: count, dtype: int32
Valores imputados:
permit
True     40959
False    18441
Name: count, dtype: int64
In [81]:
train.isnull().sum()
Out[81]:
id                       0
amount_tsh               0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
scheme_name              0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
waterpoint_type_group    0
status_group             0
day                      0
month                    0
year                     0
dtype: int64
In [82]:
amount_tsh_counts = train['amount_tsh'].value_counts()

print(amount_tsh_counts)
amount_tsh
0.0         41639
500.0        3102
50.0         2472
1000.0       1488
20.0         1463
            ...  
6300.0          1
120000.0        1
138000.0        1
350000.0        1
59.0            1
Name: count, Length: 98, dtype: int64
In [83]:
import seaborn as sns
import matplotlib.pyplot as plt
import math

# Calcular el número de subplots necesarios
num_plots = len(cat_cols)
num_cols = 5
num_rows = math.ceil(num_plots / num_cols)

# Crear subplots
fig, axs = plt.subplots(ncols=num_cols, nrows=num_rows, figsize=(20, 5 * num_rows))
axs = axs.flatten()

# Inicializar índice
index = 0

# Iterar sobre las columnas categóricas
for col in cat_cols:
    sns.countplot(data=train, x=col, ax=axs[index])
    index += 1

# Ajustar el diseño
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)
plt.show()
No description has been provided for this image
In [84]:
num_cols = train.select_dtypes(include='number').columns.tolist()
#Variables numéricas
fig, axs = plt.subplots(ncols=5, nrows=3, figsize=(20, 10))
axs = axs.flatten() # 

index = 0
for k,v in train[num_cols].items(): 
    if  (k == 'permanencia'):
        sns.countplot(v, ax=axs[index])
    else:
        sns.distplot(v, bins=20, ax=axs[index])
    index += 1

plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\2586130762.py:11: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(v, bins=20, ax=axs[index])
No description has been provided for this image
In [85]:
import pandas as pd
import numpy as np

# Calcular la proporción de cada año de construcción sobre el total de observaciones no nulas
total_non_zero = train['construction_year'].loc[train['construction_year'] != 0].count()
proportions = train['construction_year'].loc[train['construction_year'] != 0].value_counts() / total_non_zero

# Obtener la cantidad de valores 0
total_zeros = (train['construction_year'] == 0).sum()

# Multiplicar las proporciones por el total de valores 0
imputed_zeros = np.round(proportions * total_zeros).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores 0
imputed_zeros_delta = total_zeros - imputed_zeros.sum()
if imputed_zeros_delta > 0:
    first_non_zero_idx = imputed_zeros[imputed_zeros > 0].index[0]
    imputed_zeros[first_non_zero_idx] += imputed_zeros_delta

# Crear una lista con los años según las cantidades calculadas
impute_values = []
for year, count in imputed_zeros.items():
    impute_values.extend([year] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_zeros:
    impute_values = impute_values[:total_zeros]
elif len(impute_values) < total_zeros:
    additional_values = np.random.choice(proportions.index, size=total_zeros - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores 0 en la columna 'construction_year'
zero_indices = train['construction_year'][train['construction_year'] == 0].index
train.loc[zero_indices, 'construction_year'] = impute_values

# Mostrar los resultados
print("Total de valores 0:", total_zeros)
print("Proporciones de cada año:")
print(proportions)
print("Cantidad imputada a cada año:")
print(imputed_zeros)
print("Valores imputados:")
print(train['construction_year'].value_counts().sort_index())
Total de valores 0: 20709
Proporciones de cada año:
construction_year
2010    0.068362
2008    0.067535
2009    0.065467
2000    0.054044
2007    0.041017
2006    0.038019
2003    0.033238
2011    0.032462
2004    0.029025
2012    0.028017
2002    0.027784
1978    0.026802
1995    0.026208
2005    0.026130
1999    0.025303
1998    0.024967
1990    0.024657
1985    0.024424
1996    0.020961
1980    0.020961
1984    0.020134
1982    0.019229
1994    0.019074
1972    0.018299
1974    0.017472
1997    0.016645
1992    0.016541
1993    0.015714
2001    0.013957
1988    0.013466
1983    0.012613
1975    0.011295
1986    0.011217
1976    0.010700
1970    0.010623
1991    0.008374
1989    0.008167
1987    0.007805
1981    0.006151
1977    0.005221
1979    0.004962
1973    0.004756
2013    0.004549
1971    0.003748
1960    0.002636
1967    0.002274
1963    0.002197
1968    0.001990
1969    0.001525
1964    0.001034
1962    0.000775
1961    0.000543
1965    0.000491
1966    0.000439
Name: count, dtype: float64
Cantidad imputada a cada año:
construction_year
2010    1418
2008    1399
2009    1356
2000    1119
2007     849
2006     787
2003     688
2011     672
2004     601
2012     580
2002     575
1978     555
1995     543
2005     541
1999     524
1998     517
1990     511
1985     506
1996     434
1980     434
1984     417
1982     398
1994     395
1972     379
1974     362
1997     345
1992     343
1993     325
2001     289
1988     279
1983     261
1975     234
1986     232
1976     222
1970     220
1991     173
1989     169
1987     162
1981     127
1977     108
1979     103
1973      98
2013      94
1971      78
1960      55
1967      47
1963      45
1968      41
1969      32
1964      21
1962      16
1961      11
1965      10
1966       9
Name: count, dtype: int32
Valores imputados:
construction_year
1960     157
1961      32
1962      46
1963     130
1964      61
1965      29
1966      26
1967     135
1968     118
1969      91
1970     631
1971     223
1972    1087
1973     282
1974    1038
1975     671
1976     636
1977     310
1978    1592
1979     295
1980    1245
1981     365
1982    1142
1983     749
1984    1196
1985    1451
1986     666
1987     464
1988     800
1989     485
1990    1465
1991     497
1992     983
1993     933
1994    1133
1995    1557
1996    1245
1997     989
1998    1483
1999    1503
2000    3210
2001     829
2002    1650
2003    1974
2004    1724
2005    1552
2006    2258
2007    2436
2008    4012
2009    3889
2010    4063
2011    1928
2012    1664
2013     270
Name: count, dtype: int64
In [86]:
# Visualizar la distribución de construction_year después de reemplazar los ceros
plt.hist(train['construction_year'], bins=30, color='skyblue', edgecolor='black')
plt.title('Histograma de construction_year (después de reemplazar ceros)')
plt.xlabel('Año de construcción')
plt.ylabel('Frecuencia')
plt.show()

# Boxplot para visualizar outliers después de reemplazar los ceros
plt.boxplot(train['construction_year'], vert=False)
plt.title('Boxplot de construction_year (después de reemplazar ceros)')
plt.xlabel('Año de construcción')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [87]:
train.loc[train['population'] == 30500.000000, 'population'] = 150
In [88]:
import pandas as pd
import matplotlib.pyplot as plt

# Visualizar la distribución de population
plt.hist(train['population'], bins=30, color='skyblue', edgecolor='black')
plt.title('Histograma de population')
plt.xlabel('Población')
plt.ylabel('Frecuencia')
plt.show()

# Boxplot para visualizar outliers
plt.boxplot(train['population'], vert=False)
plt.title('Boxplot de population')
plt.xlabel('Población')
plt.show()

# Ver las estadísticas descriptivas de population
print(train['population'].describe())

# Definir los umbrales basados en cuantiles
low_threshold = train['population'].quantile(0.25)
medium_threshold = train['population'].quantile(0.75)

print(f'Low threshold: {low_threshold}')
print(f'Medium threshold: {medium_threshold}')

# Función para convertir la población en categorías
def categorize_population(pop):
    if pop == 0:
        return 'No population'
    elif pop <= low_threshold:
        return 'Low population'
    elif pop <= medium_threshold:
        return 'Medium population'
    else:
        return 'High population'

# Aplicar la función al DataFrame
train['population_category'] = train['population'].apply(categorize_population)

# Verificar la nueva columna
print(train['population_category'].value_counts())

# Visualizar la distribución de la nueva variable categórica
train['population_category'].value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribución de categorías de población')
plt.xlabel('Categoría de población')
plt.ylabel('Frecuencia')
plt.show()
No description has been provided for this image
No description has been provided for this image
count    59400.00000
mean       179.39904
std        454.77288
min          0.00000
25%          0.00000
50%         25.00000
75%        214.25000
max      15300.00000
Name: population, dtype: float64
Low threshold: 0.0
Medium threshold: 214.25
population_category
Medium population    23169
No population        21381
High population      14850
Name: count, dtype: int64
No description has been provided for this image
In [89]:
train.drop('population', axis=1, inplace=True)
In [90]:
# Obtener el conteo de valores de amount_tsh
amount_tsh_counts = train['amount_tsh'].value_counts()

# Mostrar los primeros valores (por ejemplo, los 10 primeros)
print(amount_tsh_counts.head(10))

# Graficar la distribución de amount_tsh
plt.figure(figsize=(10, 6))
amount_tsh_counts.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribución de amount_tsh')
plt.xlabel('amount_tsh')
plt.ylabel('Frecuencia')
plt.show()
amount_tsh
0.0       41639
500.0      3102
50.0       2472
1000.0     1488
20.0       1463
200.0      1220
100.0       816
10.0        806
30.0        743
2000.0      704
Name: count, dtype: int64
No description has been provided for this image
In [91]:
# Definir la función para categorizar amount_tsh
def categorize_amount_tsh(amount):
    if amount == 0:
        return 'No water point'
    elif amount <= 50:
        return 'Low water availability'
    elif amount <= 500:
        return 'Medium water availability'
    else:
        return 'High water availability'

# Aplicar la función al DataFrame para crear una nueva columna categorizada
train['amount_tsh_category'] = train['amount_tsh'].apply(categorize_amount_tsh)

# Eliminar la columna original amount_tsh
train.drop('amount_tsh', axis=1, inplace=True)

# Verificar que la columna original haya sido eliminada
print(train)
          id           funder  gps_height               installer  longitude  \
0      69572            Roman        1390             Rare_50_100  34.938093   
1       8776       Rare_50_99        1399  Rare_menor_50_mayor_20  34.698766   
2      34310         Rare_1_9         686            Rare_201_500  37.460664   
3      67743           Unicef         263            Rare_201_500  38.486161   
4      19728         Rare_1_9           0            Rare_101_150  31.130847   
...      ...              ...         ...                     ...        ...   
59395  60739  Germany Republi        1210                     CES  37.169807   
59396  27263       Rare_10_49        1212               Rare_5_20  35.249991   
59397  37057         Rare_1_9           0               Rare_5_20  34.017087   
59398  31282         Rare_1_9           0               Rare_5_20  35.861315   
59399  26348       World Bank         191            Rare_101_150  38.104048   

        latitude   wpt_name  num_private                    basin  \
0      -9.856322    Shuleni            0               Lake Nyasa   
1      -2.147466   Zahanati            0            Lake Victoria   
2      -3.821329  Rare_2_30            0                  Pangani   
3     -11.155298     Rare_1            0  Ruvuma / Southern Coast   
4      -1.825359    Shuleni            0            Lake Victoria   
...          ...        ...          ...                      ...   
59395  -3.253847     Rare_1            0                  Pangani   
59396  -9.070629     Rare_1            0                   Rufiji   
59397  -8.750434  Rare_2_30            0                   Rufiji   
59398  -6.378573     Rare_1            0                   Rufiji   
59399  -6.747464     Rare_1            0              Wami / Ruvu   

          subvillage  ...           source_type  source_class  \
0          Rare_5_10  ...                spring   groundwater   
1          Rare_5_10  ...  rainwater harvesting       surface   
2      Rare_101_plus  ...                   dam       surface   
3         Rare_11_20  ...              borehole   groundwater   
4           Rare_2_4  ...  rainwater harvesting       surface   
...              ...  ...                   ...           ...   
59395       Rare_2_4  ...                spring   groundwater   
59396     Rare_11_20  ...            river/lake       surface   
59397       Rare_2_4  ...              borehole   groundwater   
59398     Rare_21_50  ...          shallow well   groundwater   
59399         Rare_1  ...          shallow well   groundwater   

                   waterpoint_type waterpoint_type_group    status_group day  \
0               communal standpipe    communal standpipe      functional  14   
1               communal standpipe    communal standpipe      functional   6   
2      communal standpipe multiple    communal standpipe      functional  25   
3      communal standpipe multiple    communal standpipe  non functional  28   
4               communal standpipe    communal standpipe      functional  13   
...                            ...                   ...             ...  ..   
59395           communal standpipe    communal standpipe      functional   3   
59396           communal standpipe    communal standpipe      functional   7   
59397                    hand pump             hand pump      functional  11   
59398                    hand pump             hand pump      functional   8   
59399                    hand pump             hand pump      functional  23   

      month  year population_category      amount_tsh_category  
0         3  2011   Medium population  High water availability  
1         3  2013     High population           No water point  
2         2  2013     High population   Low water availability  
3         1  2013   Medium population           No water point  
4         7  2011       No population           No water point  
...     ...   ...                 ...                      ...  
59395     5  2013   Medium population   Low water availability  
59396     5  2011   Medium population  High water availability  
59397     4  2011       No population           No water point  
59398     3  2011       No population           No water point  
59399     3  2011   Medium population           No water point  

[59400 rows x 43 columns]
In [92]:
# Identificar las columnas numéricas
num_cols = train.select_dtypes(include=[np.number]).columns
print("Columnas numéricas:", num_cols)

def remove_outliers(df, columns):
    for col in columns:
        # Calcular el primer y tercer cuartil
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        
        # Calcular el rango intercuartil (IQR)
        IQR = Q3 - Q1
        
        # Definir los límites para los outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Filtrar los datos para eliminar los outliers
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    
    return df

# Eliminar outliers para todas las columnas numéricas
train_cleaned = remove_outliers(train, num_cols)
Columnas numéricas: Index(['id', 'gps_height', 'longitude', 'latitude', 'num_private',
       'region_code', 'district_code', 'construction_year', 'day', 'month',
       'year'],
      dtype='object')
In [93]:
# Visualizar histogramas y boxplots para todas las columnas numéricas
fig, axes = plt.subplots(nrows=len(num_cols), ncols=2, figsize=(15, 5 * len(num_cols)))

for i, col in enumerate(num_cols):
    # Histograma
    axes[i, 0].hist(train_cleaned[col], bins=100, color='skyblue', edgecolor='black')
    axes[i, 0].set_title(f'Histograma de {col} (sin outliers)')
    axes[i, 0].set_xlabel(col)
    axes[i, 0].set_ylabel('Frecuencia')

    # Diagrama de caja (boxplot)
    axes[i, 1].boxplot(train_cleaned[col], vert=False)
    axes[i, 1].set_title(f'Boxplot de {col} (sin outliers)')
    axes[i, 1].set_xlabel(col)

plt.tight_layout()
plt.show()
No description has been provided for this image

Para abordar la variable population, se aplicó una estrategia similar a la utilizada para la variable funder, con el objetivo de manejar los valores nulos sin perder una gran cantidad de datos. Eliminar las observaciones con valores nulos hubiera resultado en una pérdida significativa de información, lo cual no era deseable.

Proceso de Imputación de Valores Nulos

  1. Calcular las proporciones de cada categoría: Primero, se calcularon las proporciones de cada categoría en la variable population en función del total de observaciones no nulas. Esto permitió entender la distribución actual de las categorías.

  2. Determinar la cantidad de valores nulos: Luego, se determinó la cantidad total de valores nulos presentes en la variable population.

  3. Imputar valores nulos basado en las proporciones calculadas: Se calculó cuántos valores nulos deberían asignarse a cada categoría en función de sus proporciones. Esto se hizo multiplicando las proporciones por el número total de valores nulos y redondeando los resultados.

  4. Ajuste de la distribución: Para asegurarse de que el total de valores imputados coincidiera exactamente con el número de valores nulos, se realizaron ajustes menores. Si había una discrepancia debido al redondeo, se ajustó el valor de la categoría más frecuente para compensar la diferencia.

Agrupación y Transformación de la Variable Population A diferencia de la variable funder, en lugar de hacer una reagrupación posterior a la imputación de valores nulos, se decidió transformar la variable population en tres grupos distintos. Esta decisión se basó en la representación de la población que habitaba cerca de lugares con bombas de agua y en el análisis de un boxplot que mostraba que muchas observaciones se perdían debido a la distribución de los datos.

Transformación en Grupos:

Sin Población: Incluye observaciones donde la población es 0. Población Baja: Incluye observaciones donde la población es menor o igual al percentil 0.25. Población Media: Incluye observaciones donde la población es menor o igual al percentil 0.75. Alta Población: Incluye observaciones donde la población es mayor al percentil 0.75. Esta transformación permitió simplificar la variable population, reduciendo la dimensionalidad y mejorando la estabilidad del modelo, sin perder un número significativo de observaciones.

Aplicación en la Variable Amount El mismo enfoque también se aplicó a la variable amount. Se crearon grupos basados en los cuantiles de la distribución de la variable amount, asegurando así una representación más equitativa y manejable de los datos en el análisis.

In [95]:
train_cleaned[num_cols].describe()
Out[95]:
id gps_height longitude latitude num_private region_code district_code construction_year day month year
count 51249.000000 51249.000000 51249.000000 51249.000000 51249.0 51249.000000 51249.000000 51249.000000 51249.000000 51249.000000 51249.000000
mean 37127.048586 739.339851 34.897465 -5.748341 0.0 10.794142 3.180140 1996.867041 15.357431 4.448984 2011.908057
std 21470.596454 700.761113 2.422784 2.736180 0.0 6.022995 1.773173 12.406447 8.669823 3.002525 0.939689
min 0.000000 -90.000000 29.607122 -11.649440 0.0 1.000000 0.000000 1960.000000 1.000000 1.000000 2011.000000
25% 18530.000000 0.000000 33.204679 -8.385803 0.0 5.000000 2.000000 1988.000000 7.000000 2.000000 2011.000000
50% 37072.000000 694.000000 34.827052 -5.038109 0.0 11.000000 3.000000 2000.000000 16.000000 3.000000 2012.000000
75% 55685.000000 1367.000000 36.894172 -3.358872 0.0 16.000000 4.000000 2008.000000 23.000000 7.000000 2013.000000
max 74247.000000 2770.000000 40.231707 -0.998464 0.0 21.000000 7.000000 2013.000000 31.000000 12.000000 2013.000000
In [96]:
cat_cols1= train_cleaned.select_dtypes(include=['object','category']).columns
print(train_cleaned[cat_cols1].describe().transpose())
                       count unique                      top   freq
funder                 51249     95   Government Of Tanzania   8855
installer              51249     22                      DWE  16238
wpt_name               51249      7                   Rare_1  28683
basin                  51249      9                  Pangani   8115
subvillage             51249      7                 Rare_2_4  16236
region                 51249     21                   Iringa   5286
lga                    51249    111                   Njombe   2502
ward                   51249   1788                    Igosi    307
public_meeting         51249      2                     True  46728
recorded_by            51249      1  GeoData Consultants Ltd  51249
scheme_management      51249     11                      VWC  33620
scheme_name            51249     51                Rare_8_12   3656
permit                 51249      2                     True  36342
extraction_type        51249     18                  gravity  24639
extraction_type_group  51249     13                  gravity  24639
extraction_type_class  51249      7                  gravity  24639
management             51249     12                      vwc  34726
management_group       51249      5               user-group  45190
payment                51249      7                never pay  21541
payment_type           51249      7                never pay  21541
water_quality          51249      8                     soft  44414
quality_group          51249      6                     good  44414
quantity               51249      5                   enough  28618
quantity_group         51249      5                   enough  28618
source                 51249     10                   spring  15623
source_type            51249      7                   spring  15623
source_class           51249      3              groundwater  39040
waterpoint_type        51249      7       communal standpipe  26132
waterpoint_type_group  51249      6       communal standpipe  30996
status_group           51249      3               functional  28412
population_category    51249      3        Medium population  20417
amount_tsh_category    51249      4           No water point  35544
In [97]:
train_cleaned.dtypes
Out[97]:
id                         int64
funder                    object
gps_height                 int64
installer                 object
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
subvillage                object
region                    object
region_code                int64
district_code              int64
lga                       object
ward                      object
public_meeting            object
recorded_by               object
scheme_management         object
scheme_name               object
permit                    object
construction_year          int64
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
payment_type              object
water_quality             object
quality_group             object
quantity                  object
quantity_group            object
source                    object
source_type               object
source_class              object
waterpoint_type           object
waterpoint_type_group     object
status_group              object
day                        int32
month                      int32
year                       int32
population_category       object
amount_tsh_category       object
dtype: object
In [98]:
variables_dummies = ['funder','installer','wpt_name','basin','subvillage','region','lga','ward','scheme_management','recorded_by','scheme_name','extraction_type','extraction_type_group','extraction_type_class','management','management_group','payment','payment_type','water_quality','quality_group','quantity','quantity_group','source','source_type','source_class','waterpoint_type','waterpoint_type_group','status_group','population_category','amount_tsh_category','public_meeting','permit']
In [99]:
import pickle

# Lista de variables categóricas a convertir
variables_categoricas = variables_dummies

# Crear un diccionario para almacenar los mapeos inversos
mapeo_inverso = {}

# Iterar sobre las variables categóricas y convertirlas en numéricas
for var in variables_categoricas:
    # Factorizar y obtener los códigos y los niveles originales
    codes, uniques = pd.factorize(train_cleaned[var])
    
    # Crear un diccionario de mapeo inverso
    mapeo_inverso[var] = dict(zip(range(1, len(uniques) + 1), uniques))
    
    # Reemplazar la columna original por la columna numérica + 1 (para empezar desde 1)
    train_cleaned[var] = codes + 1

# Verificar el resultado y mostrar el mapeo inverso para 'funder' como ejemplo
print(train_cleaned.head())

# Nombre del archivo donde guardarás el diccionario
nombre_archivo = 'mapeo_inverso.pkl'

# Guardar el diccionario mapeo_inverso en un archivo usando pickle
with open(nombre_archivo, 'wb') as f:
    pickle.dump(mapeo_inverso, f)

print(f"Diccionario mapeo_inverso guardado en {nombre_archivo}")

# Mostrar el mapeo inverso para todas las variables categóricas
for var, mapeo in mapeo_inverso.items():
    print(f"Mapeo inverso para '{var}': {mapeo}")
      id  funder  gps_height  installer  longitude  latitude  wpt_name  \
0  69572       1        1390          1  34.938093 -9.856322         1   
1   8776       2        1399          2  34.698766 -2.147466         2   
2  34310       3         686          3  37.460664 -3.821329         3   
4  19728       3           0          4  31.130847 -1.825359         1   
6  19816       4           0          3  33.362410 -3.766365         4   

   num_private  basin  subvillage  ...  source_type  source_class  \
0            0      1           1  ...            1             1   
1            0      2           1  ...            2             2   
2            0      3           2  ...            3             2   
4            0      2           3  ...            2             2   
6            0      4           3  ...            4             1   

   waterpoint_type  waterpoint_type_group  status_group  day  month  year  \
0                1                      1             1   14      3  2011   
1                1                      1             1    6      3  2013   
2                2                      1             1   25      2  2013   
4                1                      1             1   13      7  2011   
6                3                      2             2    1     10  2012   

   population_category  amount_tsh_category  
0                    1                    1  
1                    2                    2  
2                    2                    3  
4                    3                    2  
6                    3                    2  

[5 rows x 43 columns]
Diccionario mapeo_inverso guardado en mapeo_inverso.pkl
Mapeo inverso para 'funder': {1: 'Roman', 2: 'Rare_50_99', 3: 'Rare_1_9', 4: 'Dwsp', 5: 'Rwssp', 6: 'Wateraid', 7: 'Rare_10_49', 8: 'Danida', 9: 'World Vision', 10: 'Lawatefuka Water Supply', 11: 'Rudep', 12: 'Unicef', 13: 'Hesawa', 14: 'Twe', 15: 'Isf', 16: 'Government Of Tanzania', 17: 'Water', 18: 'Private Individual', 19: 'Ces(gmbh)', 20: 'District Council', 21: 'Muwsa', 22: 'Kkkt_makwale', 23: 'Ces (gmbh)', 24: 'Kkkt', 25: 'Roman Catholic', 26: 'Norad', 27: 'Adra', 28: 'Dwe', 29: 'Rc Church', 30: 'Swedish', 31: 'He', 32: 'Tcrs', 33: 'Germany Republi', 34: 'Netherlands', 35: 'Nethalan', 36: 'Tasaf', 37: 'Concern World Wide', 38: 'World Bank', 39: '0', 40: 'Shipo', 41: 'Oxfarm', 42: 'Village Council', 43: 'Dhv', 44: 'Ir', 45: 'Oikos E.Afrika', 46: 'Anglican Church', 47: 'Ministry Of Water', 48: 'Dfid', 49: 'Rural Water Supply And Sanitat', 50: 'Jica', 51: 'Amref', 52: 'Wananchi', 53: 'No', 54: 'Co', 55: 'Tassaf', 56: 'Finw', 57: 'Fini Water', 58: 'Go', 59: 'Oxfam', 60: 'Community', 61: 'Plan Int', 62: 'Lvia', 63: 'Wvt', 64: 'Hifab', 65: 'Snv', 66: 'Is', 67: 'Private', 68: 'Villagers', 69: 'Lga', 70: 'Magadini-makiwaru Water', 71: 'Adb', 72: 'Dmdd', 73: 'Mission', 74: 'Ru', 75: 'Halmashauri Ya Wilaya Sikonge', 76: 'Ki', 77: 'Tardo', 78: 'Ded', 79: 'African', 80: 'Rc', 81: 'Wua', 82: 'Dh', 83: 'Wsdp', 84: 'Hsw', 85: 'Kiliwater', 86: 'Lamp', 87: 'Bsf', 88: 'Unice', 89: 'Jaica', 90: 'Unhcr', 91: 'W.B', 92: 'Germany', 93: 'Concern', 94: 'Fw', 95: 'Mkinga Distric Coun'}
Mapeo inverso para 'installer': {1: 'Rare_50_100', 2: 'Rare_menor_50_mayor_20', 3: 'Rare_201_500', 4: 'Rare_101_150', 5: 'DWE', 6: 'DANIDA', 7: 'Rare_151_200', 8: 'Rare_5_20', 9: 'Central government', 10: 'Commu', 11: 'KKKT', 12: 'RWE', 13: 'HESAWA', 14: 'Community', 15: 'Rare_menor_5', 16: 'Government', 17: 'CES', 18: 'Hesawa', 19: '0', 20: 'DANID', 21: 'TCRS', 22: 'District Council'}
Mapeo inverso para 'wpt_name': {1: 'Shuleni', 2: 'Zahanati', 3: 'Rare_2_30', 4: 'Rare_1', 5: 'Rare_101_500', 6: 'Rare_31_100', 7: 'Msikitini'}
Mapeo inverso para 'basin': {1: 'Lake Nyasa', 2: 'Lake Victoria', 3: 'Pangani', 4: 'Internal', 5: 'Lake Tanganyika', 6: 'Rufiji', 7: 'Wami / Ruvu', 8: 'Lake Rukwa', 9: 'Ruvuma / Southern Coast'}
Mapeo inverso para 'subvillage': {1: 'Rare_5_10', 2: 'Rare_101_plus', 3: 'Rare_2_4', 4: 'Rare_1', 5: 'Rare_21_50', 6: 'Rare_11_20', 7: 'Rare_51_100'}
Mapeo inverso para 'region': {1: 'Iringa', 2: 'Mara', 3: 'Manyara', 4: 'Kagera', 5: 'Shinyanga', 6: 'Tabora', 7: 'Ruvuma', 8: 'Kilimanjaro', 9: 'Rukwa', 10: 'Mwanza', 11: 'Kigoma', 12: 'Dodoma', 13: 'Pwani', 14: 'Arusha', 15: 'Mbeya', 16: 'Singida', 17: 'Tanga', 18: 'Morogoro', 19: 'Lindi', 20: 'Dar es Salaam', 21: 'Mtwara'}
Mapeo inverso para 'lga': {1: 'Ludewa', 2: 'Serengeti', 3: 'Simanjiro', 4: 'Karagwe', 5: 'Shinyanga Rural', 6: 'Kahama', 7: 'Tabora Urban', 8: 'Namtumbo', 9: 'Maswa', 10: 'Siha', 11: 'Meatu', 12: 'Sumbawanga Rural', 13: 'Njombe', 14: 'Ukerewe', 15: 'Same', 16: 'Kigoma Rural', 17: 'Moshi Rural', 18: 'Rombo', 19: 'Chamwino', 20: 'Bagamoyo', 21: 'Arusha Rural', 22: 'Kyela', 23: 'Kondoa', 24: 'Kilolo', 25: 'Kibondo', 26: 'Makete', 27: 'Singida Rural', 28: 'Rungwe', 29: 'Moshi Urban', 30: 'Mbulu', 31: 'Bukoba Rural', 32: 'Muheza', 33: 'Lushoto', 34: 'Meru', 35: 'Iramba', 36: 'Kilombero', 37: 'Mbarali', 38: 'Kasulu', 39: 'Bukoba Urban', 40: 'Korogwe', 41: 'Bukombe', 42: 'Morogoro Rural', 43: 'Musoma Rural', 44: 'Sengerema', 45: 'Iringa Rural', 46: 'Muleba', 47: 'Dodoma Urban', 48: 'Hanang', 49: 'Misenyi', 50: 'Missungwi', 51: 'Songea Rural', 52: 'Tanga', 53: 'Tunduru', 54: 'Hai', 55: 'Mwanga', 56: 'Biharamulo', 57: 'Ileje', 58: 'Mpwapwa', 59: 'Mvomero', 60: 'Bunda', 61: 'Kiteto', 62: 'Longido', 63: 'Urambo', 64: 'Mbozi', 65: 'Lindi Rural', 66: 'Sikonge', 67: 'Ilala', 68: 'Tarime', 69: 'Temeke', 70: 'Mbeya Rural', 71: 'Magu', 72: 'Manyoni', 73: 'Igunga', 74: 'Kilosa', 75: 'Babati', 76: 'Chunya', 77: 'Mufindi', 78: 'Mpanda', 79: 'Kibaha', 80: 'Singida Urban', 81: 'Nzega', 82: 'Mkuranga', 83: 'Nkasi', 84: 'Bahi', 85: 'Mbinga', 86: 'Ulanga', 87: 'Sumbawanga Urban', 88: 'Morogoro Urban', 89: 'Tandahimba', 90: 'Mtwara Urban', 91: 'Kongwa', 92: 'Uyui', 93: 'Bariadi', 94: 'Kwimba', 95: 'Monduli', 96: 'Shinyanga Urban', 97: 'Ngorongoro', 98: 'Rorya', 99: 'Pangani', 100: 'Nachingwea', 101: 'Kisarawe', 102: 'Kinondoni', 103: 'Kigoma Urban', 104: 'Ilemela', 105: 'Handeni', 106: 'Kilindi', 107: 'Kilwa', 108: 'Arusha Urban', 109: 'Songea Urban', 110: 'Nyamagana', 111: 'Mafia'}
Mapeo inverso para 'ward': {1: 'Mundindi', 2: 'Natta', 3: 'Ngorika', 4: 'Nyakasimbi', 5: 'Samuye', 6: 'Chambo', 7: 'Itetemia', 8: 'Kaisho', 9: 'Msindo', 10: 'Busilili', 11: 'Siha Kaskazini', 12: 'Nkoma', 13: 'Mkowe', 14: 'Mdandu', 15: 'Usuka', 16: 'Ilangala', 17: 'Mawengi', 18: 'Matola', 19: 'Kisiwani', 20: 'Mkigo', 21: 'Kimochi', 22: 'Nyaishozi', 23: 'Mengwe Manda', 24: 'Siha Kati', 25: 'Dabalo', 26: 'Chalinze', 27: 'Kilema  Kati', 28: 'Lupanga', 29: 'Oltrumet', 30: 'Katumbasongwe', 31: 'Mnenia', 32: "Boma la ng'ombe", 33: 'Rugongowe', 34: 'Kahe', 35: 'Ipelele', 36: 'Mtwango', 37: 'Merya', 38: 'Nduruma', 39: 'Kiwira', 40: 'Wangama', 41: 'Msaranga', 42: 'Laela', 43: 'Makwale', 44: 'Kiruruma', 45: 'Njoro', 46: 'Kurua Vunjo Mashariki', 47: 'Oltroto', 48: 'Bargish', 49: 'Izimbya', 50: 'Kicheba', 51: 'Hemtoye', 52: 'Kagongo', 53: 'Kikatiti', 54: 'Nduguti', 55: 'Naintiri', 56: 'Ikuka', 57: 'Isingiro', 58: 'Ilembula', 59: 'Utengule', 60: 'Mawindi', 61: 'Ubaruku', 62: 'Manchira', 63: 'Kitagata', 64: 'Kibeta', 65: 'Imalinyi', 66: 'Mbuguni', 67: 'Mashewa', 68: 'Ushirika', 69: "Mang'oto", 70: 'Tawa', 71: 'Nkoanrua', 72: 'Buswahili', 73: 'Bupandwamhela', 74: 'Magulilwa', 75: 'Kasharunga', 76: 'Chihanga', 77: 'Mwamgongo', 78: 'Kiromo', 79: 'Measkron', 80: 'Kyaka', 81: 'Nyamnyusi', 82: 'Muyama', 83: 'Misasi', 84: 'Mpitimbi', 85: 'Mwantini', 86: 'Mabokweni', 87: 'Ibaga', 88: 'Bunyambo', 89: 'Marumba', 90: 'Masama Mashariki', 91: 'Jipe', 92: 'Isanga', 93: 'Kiwere', 94: 'Lutebe', 95: 'Maji ya Chai', 96: 'Lusahunga', 97: 'Ndola', 98: 'Kimagai', 99: 'Kashenye', 100: 'Mzumbe', 101: 'Igundu', 102: 'Lengatei', 103: 'Lupalilo', 104: 'Kaibanja', 105: 'Runazi', 106: 'Namanga', 107: 'Itete', 108: 'Urambo', 109: 'Ikungu', 110: 'Jana', 111: 'Usagara', 112: 'Utengule Usangu', 113: 'Ukune', 114: 'Mbaramo', 115: 'Madope', 116: 'Ndalambo', 117: 'Nachunyu', 118: 'Kalenge', 119: 'Nyamigogo', 120: 'Kyela Urban', 121: 'Mkambalani', 122: 'Igigwa', 123: 'Mvomero', 124: 'Kiroka', 125: 'Kitunda', 126: 'Ruhanga', 127: 'Mahongole', 128: 'Rujewa', 129: 'Sirari', 130: 'Igurusi', 131: 'Kimbiji', 132: 'Kebanchabancha', 133: 'Muhinda', 134: 'Muhuwesi', 135: 'Igale', 136: 'Masumbwe', 137: 'Ipililo', 138: 'Miangalua', 139: "Wanging'ombe", 140: 'Ruhita', 141: 'Nyanguge', 142: 'Makuru', 143: 'Nkinga', 144: 'Mganza', 145: 'Mkongo', 146: 'Mtowisa', 147: 'Lugoba', 148: 'Ifakara', 149: 'Kidete', 150: 'Msanzi', 151: 'Magara', 152: 'Iyula', 153: 'Magila', 154: 'Mseke', 155: 'Ngarenanyuki', 156: 'Ukumbi', 157: 'Mbangala', 158: 'Kassambya', 159: 'Ilungu', 160: 'Siuyu', 161: 'Nyololo', 162: 'Isalavanu', 163: 'Suji', 164: 'Kitirima Kingachi', 165: 'Neruma', 166: 'Mudida', 167: 'Nyanga', 168: 'Msia', 169: 'Akheri', 170: 'Mwaya', 171: 'Mishamo', 172: 'Mwembe', 173: 'Bujonde', 174: 'Mabogini', 175: 'Itwangi', 176: 'Olkokola', 177: 'Ruanda', 178: 'Kirangare', 179: 'Munzenze', 180: 'Kibaha', 181: 'Chakwale', 182: 'Mandewa', 183: 'Mbarika', 184: 'Msambiazi', 185: 'Mukituntu', 186: 'Maghojea', 187: 'Hidet', 188: 'Iwindi', 189: 'Miguwa', 190: 'Hembeti', 191: 'Kanga', 192: 'Pongwe', 193: 'Rubeho', 194: 'Mbalawala', 195: 'Mahuninga', 196: 'Bukembe', 197: 'Kihorogota', 198: 'Nkumba', 199: 'Kirua Vunjo Magharibi', 200: 'Magawa', 201: 'Mwika Kusini', 202: 'Ipepo', 203: 'Nyandekwa', 204: 'Namanyere', 205: 'Chikuyu', 206: 'Mbabala', 207: 'Imesela', 208: 'Zanka', 209: 'Msambara', 210: 'Ruvu', 211: 'Ilima', 212: 'Sikonge', 213: 'Nambinzo', 214: 'Didia', 215: 'Lipingo', 216: 'Mtimbira', 217: 'Kirongwe', 218: 'Chala', 219: 'Munyegera', 220: 'Kibondo Urban', 221: 'Murieti', 222: 'Uvinza', 223: 'Okoani Kibosho', 224: 'Dakama', 225: 'Ntendo', 226: 'Gwanumpu', 227: 'Mzinga', 228: 'Mwanga', 229: 'Mbokomu', 230: 'Mkoreha', 231: 'Bwisya', 232: 'Nkangamo', 233: 'Ilola', 234: 'Ikolo', 235: 'Kisondela', 236: 'Ufana', 237: 'Inyala', 238: 'Mabamba', 239: 'Mambwenkoswe', 240: 'Magengeni', 241: 'Chunyu', 242: 'Sintali', 243: 'Mwandiga', 244: 'Mtipwili', 245: 'Hombolo', 246: 'Mbebe', 247: 'Mpindo', 248: 'Nandembo', 249: 'Babayu', 250: 'Kiberege', 251: 'Katazi', 252: 'Mwashiku', 253: 'Isongo', 254: 'Sagara', 255: 'Moivo', 256: 'Endasak', 257: 'Marangu Mashariki', 258: 'Liuli', 259: 'Kidugalo', 260: 'Mererani', 261: 'Rusesa', 262: 'Ihanja', 263: 'Mbugani', 264: 'Bwawani', 265: 'Karambi', 266: 'Tingatinga', 267: 'Kongwa Urban', 268: 'Sunya', 269: 'Lupila', 270: 'Choma', 271: 'Kidatu', 272: 'USA River', 273: 'Kibara', 274: 'Isikizya', 275: 'Lumemo', 276: 'Idete', 277: 'Gumbiro', 278: 'Mchoteka', 279: 'Manchali', 280: 'Pandambili', 281: 'Kileo', 282: 'Uru Kusini Mawela', 283: 'Mhunze', 284: 'Usangule', 285: 'Kilagano', 286: 'Mafyeko', 287: 'Mogwa', 288: 'Oljoro', 289: 'Ngulla', 290: 'Legezamwendo', 291: 'Chome', 292: 'Ligera', 293: 'Maroroni', 294: 'Muriti', 295: 'Ipande', 296: 'Mwendakulima', 297: 'Shishiyu', 298: 'Kalenga', 299: 'Doma', 300: 'Mchesi', 301: 'Kirando', 302: 'Idunda', 303: 'Ulaya', 304: 'Nyakahura', 305: 'Lembeni', 306: 'Nyanzenda', 307: 'Msongola', 308: 'Kilosa', 309: 'Mabwerebwere', 310: 'Kirua Vunjo Kusini', 311: 'Tumbi', 312: 'Uru Kaskazini', 313: 'Selela', 314: 'Buhendangabo', 315: 'Mshewe', 316: 'Kindi', 317: 'Mamire', 318: 'Kimnyaki', 319: 'Ibadakuli', 320: 'Kitobo', 321: "Mang'ula", 322: 'Malangali', 323: 'Chita', 324: 'Ihango', 325: 'Maligisu', 326: 'Matendo', 327: 'Lufilyo', 328: 'Mnyawa', 329: 'Kikwe', 330: 'Kibaoni', 331: 'Usanda', 332: 'Kiwangwa', 333: 'Kikomero', 334: 'Muhange', 335: 'Olele', 336: 'Nainokanoka', 337: 'Kidodoma', 338: 'Rangwi', 339: 'Mtapenda', 340: 'Minepa', 341: 'Uwemba', 342: 'Mgungira', 343: 'Ketumbeine', 344: 'Salawe', 345: 'Kiranyi', 346: 'Mtitaa', 347: 'Mamboya', 348: 'Siha Magharibi', 349: 'Kifanya', 350: 'Jangwani', 351: 'Buyango', 352: 'Mgombasi', 353: 'Namwinyu', 354: 'Zinga/Ikerege', 355: 'Lupembe', 356: 'Badi', 357: 'Pemba Mnazi', 358: 'Keni Mengeni', 359: 'Murungu', 360: 'Komuge', 361: 'Muze', 362: 'Murufiti', 363: 'Kahororo', 364: 'Bushiri', 365: 'Mtinko', 366: 'Machame Magharibi', 367: 'Nkungulu', 368: 'Kolandoto', 369: 'Ufukoni', 370: 'Izazi', 371: 'Euga', 372: 'Gua', 373: 'Selambala', 374: 'Gendabi', 375: 'Ulenje', 376: 'Mlimba', 377: 'Visiga', 378: 'Isange', 379: 'Ifunda', 380: 'Bashinet', 381: 'Mahenge', 382: 'Mikalanga', 383: 'Ntuntu', 384: 'Irole', 385: 'Chimala', 386: 'Mahanje', 387: 'Madibira', 388: 'Kisiju', 389: 'Mamsera', 390: 'Njoge', 391: 'Mwangaza', 392: 'Mwandoya', 393: 'Soga', 394: 'Kigonsera', 395: 'Madilu', 396: 'Ikindwa', 397: 'Msisi', 398: 'Myovizi', 399: 'Fukalo', 400: 'Maore', 401: 'Nala', 402: 'Kidodi', 403: 'Muriba', 404: 'Kihurio', 405: 'Kiomboi', 406: 'Mwamalili', 407: 'Kintinku', 408: 'Mwakilyambiti', 409: 'Bukene', 410: 'Kate', 411: 'Nyakasasa', 412: 'Shume', 413: 'Kafunzo', 414: 'Mazinde', 415: 'Miono', 416: 'Kihonda', 417: 'Nditi', 418: 'Masuguru', 419: 'Idahina', 420: 'Masange', 421: 'Nangowe', 422: 'Mamba Kusini', 423: 'Somangira', 424: 'Ligunga', 425: 'Kitulo', 426: 'Magomeni', 427: 'Dunda', 428: 'Mvumi Mission', 429: 'Mshewa', 430: 'Mbuga', 431: 'Janda', 432: 'Magagura', 433: "King'ori", 434: 'Ruhembe', 435: 'Nanga', 436: 'Dindira', 437: 'Mchombe', 438: 'Myunga', 439: 'Songa', 440: 'Namabengo', 441: 'Rungemba', 442: 'Kabaragaine', 443: 'Tinde', 444: 'Kigondo', 445: 'Kurui', 446: 'Kyerwa', 447: 'Budekwa', 448: 'Ngimu', 449: 'Isagehe', 450: 'Ushirombo', 451: 'Kenyamonta', 452: 'Mkwaja', 453: 'Mwamala', 454: 'Nyambureti', 455: 'Pugu', 456: 'Mlangarini', 457: 'Mfumbi', 458: 'Mwabuma', 459: 'Katoro', 460: 'Ruaha', 461: 'Butiama', 462: 'Mwaniko', 463: 'Uru Mashariki', 464: 'Diongoya', 465: 'Kalulu', 466: 'Ntobo', 467: 'Nyimbili', 468: 'Machame Kaskazini', 469: 'Ikoma', 470: 'Mwabusalu', 471: 'Kinaga', 472: 'Kahamanhalaga', 473: 'Mphwayungu', 474: 'Bugene', 475: 'Chitete', 476: 'Hedaru', 477: 'Kakola', 478: 'Mpui', 479: 'Yombo', 480: 'Kimuli', 481: 'Bashay', 482: 'Lagangabilili', 483: 'Maweni', 484: 'Sunga', 485: 'Makuyuni', 486: 'Rubale', 487: 'Koromije', 488: 'Mgori', 489: 'Mtumba', 490: 'Farkwa', 491: 'Mlunduzi', 492: 'Kibosho Kati', 493: "Ring'wani", 494: 'Hunyari', 495: 'Tungamaa', 496: 'Budakwa', 497: 'Image', 498: 'Shinghatini', 499: 'Muungano', 500: 'Chanika', 501: 'Mzenga', 502: 'Kijungu', 503: 'Katwe', 504: 'Malinyi', 505: 'Rungwe Mpya', 506: 'Mkoma', 507: 'Mnero Miembeni', 508: 'Ilolo Mpya', 509: 'Missungwi', 510: 'Lupiro', 511: 'Mima', 512: 'Mvuha', 513: 'Kiegei', 514: 'Kitongo Sima', 515: 'Berege', 516: 'Igosi', 517: 'Mbangamao', 518: 'Mamba', 519: 'Ubangaa', 520: 'Pandagichiza', 521: 'Mji Mwema', 522: 'Kabungu', 523: 'Nyabusozi', 524: 'Mbwewe', 525: 'Mtambula', 526: 'Mpuguso', 527: 'Mbweni', 528: 'Igalula', 529: 'Kazunzu', 530: 'Buterankuzi', 531: 'Pangani Mashariki', 532: 'Nshamba', 533: 'Milogodi', 534: 'Buhingo', 535: 'Balangidalalu', 536: 'Bwambo', 537: 'Ubiri', 538: 'Machame Uroki', 539: 'Ikondo', 540: 'Mikunguni', 541: 'Bumilayinga', 542: 'Mninga', 543: 'Lunguya', 544: 'Chanzuru', 545: 'Mwanjoro', 546: 'Puma', 547: 'Mwamalole', 548: 'Gode Gode', 549: 'Masama Magharibi', 550: 'Moshono', 551: 'Sandulula', 552: 'Bugarama', 553: 'Mbinga Urban', 554: 'Inyonga', 555: 'Mtibwa', 556: 'Ruhunga', 557: 'Kiloli', 558: 'Kidegembye', 559: 'Kibirizi', 560: 'Magubike', 561: 'Nyamoko', 562: 'Lwangwa', 563: 'Makuro', 564: 'Solwa', 565: 'Ludewa', 566: 'Ruiwa', 567: 'Mnyuzi', 568: 'Ihanda', 569: 'Lusewa', 570: 'Bunda', 571: 'Lubugo', 572: 'Mwika Kaskazini', 573: 'Hai Urban', 574: 'Ngongwa', 575: 'Mbezi', 576: 'Mogitu', 577: 'Isongole', 578: 'Mombo', 579: 'Ngudu', 580: 'Sejeli', 581: 'Namasakata', 582: 'Monduli Juu', 583: 'Kihanga', 584: 'Iyogela', 585: 'Mtitu', 586: 'Dumila', 587: 'Mkomazi', 588: 'Kisangura', 589: 'Ihembe', 590: 'Masela', 591: 'Muriaza', 592: 'Soera', 593: 'Kisegese', 594: 'Kilelema', 595: 'Lusu', 596: 'Mpunze', 597: 'Mlangali', 598: 'Nondwa', 599: 'Kibedya', 600: 'Igongolo', 601: 'Ikhawoda', 602: 'Dabil', 603: 'Chokaa', 604: 'Nyamato', 605: 'Kwakoa', 606: 'Matai', 607: 'Bugorora', 608: 'Buhanda businde', 609: 'Sakasaka', 610: 'Kunduchi', 611: 'Maposeni', 612: 'Ilongero', 613: 'Kipanga', 614: 'Tai', 615: 'Matema', 616: 'Marangu Magharibi', 617: 'Manda', 618: 'Kisongo', 619: 'Yaeda Chini', 620: 'Maghang', 621: 'Karitu', 622: 'Igalla', 623: 'Mzizima', 624: 'Malenga Makali', 625: 'Mkwamba', 626: 'Kashasha', 627: 'Sadani', 628: 'Magoma', 629: 'Kalinzi', 630: 'Biirabo', 631: 'Kahe Mashariki', 632: 'Kongolo', 633: 'Sumve', 634: 'Chabutwa', 635: 'Mwandu', 636: 'Hungumalwa', 637: 'Riroda', 638: 'Nyigogo', 639: 'Busi', 640: 'Nishambya', 641: 'Mtina', 642: 'Buganguzi', 643: 'Mavanga', 644: 'Upuge', 645: 'Namikango', 646: 'Potwe', 647: 'Sangabuye', 648: 'Mbogwe', 649: 'Mahida Holili', 650: "Ng'hundi", 651: 'Nambambo', 652: 'Mkalamo', 653: 'Nkwenda', 654: 'Utengule Usongwe', 655: 'Magengati', 656: 'Ilolangulu', 657: 'Bwiro', 658: 'Lilambo', 659: 'Msata', 660: 'Mwankoko', 661: 'Pande', 662: 'Nyankanga', 663: 'Kisale Msangara', 664: 'Nyarero', 665: 'Luponde', 666: 'Mazwi', 667: 'Rwinga', 668: 'Litembo', 669: 'Mgwashi', 670: 'Nakatuguru', 671: 'Murangi', 672: 'Matiri', 673: 'Bukumbi', 674: 'Binagi', 675: 'Kalambazite', 676: 'Melela', 677: 'Bukombe', 678: 'Hogoro', 679: "Ilkiding'a", 680: 'Kasungamile', 681: 'Mpepai', 682: 'Kisesa', 683: 'Madunga', 684: 'Tumati', 685: 'Shelui', 686: 'Mpinji', 687: 'Orkesumet', 688: 'Kipande', 689: 'Luagala', 690: 'Katangara Mrere', 691: 'Gelai Meirugoi', 692: 'Itigi', 693: 'Kamachumu', 694: 'Kisemu', 695: 'Bulongwa', 696: 'Kasanga', 697: 'Shambarai', 698: 'Kulimi', 699: 'Mpanda Ndogo', 700: 'Kisorya', 701: 'Ihowanza', 702: 'Unyambwa', 703: 'Kilimatinde', 704: 'Gungu', 705: 'Ilela', 706: 'Bonde la Songwe', 707: 'Tingi', 708: 'Kirwa Keni', 709: 'Mtanana', 710: 'Mwagi', 711: 'Mtanila', 712: 'Kibengu', 713: 'Kibingo', 714: 'Kwamsisi', 715: 'Lufingo', 716: 'Mkuyuni', 717: "Munga'a", 718: 'Kimamba B', 719: 'Kasokola', 720: 'Mkata', 721: 'Nyarubungo', 722: 'Ngerengere', 723: 'Kadoto', 724: 'Mwimbi', 725: 'Ipala', 726: 'Issenye', 727: 'Iduo', 728: 'Mwandeti', 729: 'Chibe', 730: 'Igowole', 731: 'Mahembe', 732: 'Sofi', 733: 'Tabaruka', 734: 'Muzye', 735: 'Nyamuswa', 736: 'Isansa', 737: 'Bitale', 738: 'Magugu', 739: 'Mahuta', 740: 'Senga', 741: 'Dareda', 742: 'Simbo', 743: 'Kahama Urban', 744: 'Lake Jipe', 745: 'Huzi', 746: 'Biro', 747: 'Matimira', 748: 'Bupigu', 749: 'Kasanda', 750: 'Msingi', 751: 'Murutunguru', 752: 'Katuma', 753: 'Luegu', 754: 'Kinyeto', 755: 'Kashishi', 756: 'Kiloleli', 757: 'Itaka', 758: 'Itewe', 759: 'Kibamba', 760: 'Gisambalang', 761: 'Maruku', 762: 'Kolero', 763: 'Tuwemacho', 764: 'Lukanga', 765: 'Kitama', 766: 'Ughandi', 767: 'Arusha Chini', 768: 'Gwandi', 769: 'Kisaki', 770: 'Kyimo', 771: 'Mnero Ngongo', 772: 'Sukuma', 773: 'Buruma', 774: 'Mlambo-Itobo', 775: 'Pito', 776: 'Kanyangereko', 777: 'Olgosorok', 778: 'Nanjara Reha', 779: 'Ntwike', 780: 'Dongobesh', 781: 'Ngoma', 782: 'Bunju', 783: 'Kibakwe', 784: 'Lupata', 785: 'Kamuli', 786: 'Paranga', 787: 'Arash', 788: 'Kukirango', 789: 'Unyamikumbi', 790: 'Kilosa Mpepo', 791: 'Mbuyuni', 792: 'Endulen', 793: 'Kisuke', 794: 'Naberera', 795: 'Bulungwa', 796: 'Ndolwa', 797: 'Songambele', 798: 'Machame Mashariki', 799: 'Chipanga', 800: 'Singisi', 801: 'Nyabubinza', 802: 'Nyamhula', 803: 'Lukokoda', 804: 'Wasa', 805: 'Gare', 806: 'Chiola', 807: 'Nyamtukuza', 808: 'Busangi', 809: 'Mtae', 810: 'Kisese', 811: 'Songoro', 812: 'Kerenge', 813: 'Iragua', 814: 'Luhunga', 815: 'Kitanda', 816: 'Matekwe', 817: 'Mnyamba', 818: 'Nyamilama', 819: 'Ndungu', 820: 'Ikunguigazi', 821: 'Lingeka', 822: 'Nzuguni', 823: 'Nyabibuye', 824: 'Oldonyosambu', 825: 'Nata', 826: 'Igongwa', 827: 'Lyenje', 828: 'Masagati', 829: 'Vudee', 830: 'Mwamishali', 831: 'Nyambono', 832: 'Kimamba A', 833: 'Marambo', 834: 'Bunamhala', 835: 'Mapogoro', 836: 'Iguguno', 837: 'Mikumi', 838: 'Sali', 839: 'Nzihi', 840: 'Makiidi', 841: 'Kasharu', 842: 'Mtoa', 843: 'Buchambi', 844: 'Kajunjumele', 845: 'Nyambiti', 846: 'Ilunda', 847: 'Chiwezi', 848: 'Bombo', 849: 'Mambali', 850: 'Sopa', 851: 'Nsimbo', 852: 'Lumuli', 853: 'Magu Urban', 854: 'Kanyenye', 855: 'Zombo', 856: 'Shimbi', 857: 'Nguvu Moja', 858: 'Idodi', 859: 'Lusungo', 860: 'Sanje', 861: 'Mwanhunzi', 862: 'Sanu', 863: 'Nyakonga', 864: 'Magole', 865: 'Mungumaji', 866: 'Wotta', 867: 'Iniho', 868: 'Sungaji', 869: 'Ukiriguru', 870: 'Rwabere', 871: 'Bweranyange', 872: 'Kigunga', 873: 'Masagalu', 874: 'Msangano', 875: 'Busagara', 876: 'Ikama', 877: 'Ikinga', 878: 'Masama Kusini', 879: 'Kisanga', 880: 'Mingumbi', 881: 'Mpunguzi', 882: 'Kilimani', 883: 'Mbesa', 884: 'Dabaga', 885: 'Ikola', 886: 'Levolosi', 887: 'Koryo', 888: 'Kelamfua Mokala', 889: 'Tembela', 890: 'Sepuko', 891: 'Lusanga', 892: 'Kabasa', 893: 'Leguruki', 894: 'Kizumbi', 895: 'Kingale', 896: 'Nyakitonto', 897: 'Igoweko', 898: 'Nyamidaho', 899: 'Misezero', 900: 'Nyamatare', 901: 'Matamba', 902: 'Mbuzii', 903: 'Ipole', 904: 'Biharamulo Urban', 905: 'Lushoto', 906: "Dung'unyi", 907: 'Lundi', 908: 'Kikunde', 909: 'Kandete', 910: 'Kambasegese', 911: 'Njombe Urban', 912: 'Mwamabanza', 913: 'Mgandu', 914: 'Kazazi', 915: 'Same Urban', 916: 'Mwaru', 917: 'Kwamtoro', 918: 'Mantare', 919: 'Pahi', 920: 'Luhundwa', 921: 'Nyamimange', 922: 'Bumbuta', 923: 'Kakonko', 924: 'Kibosho Magharibi', 925: 'Mwada', 926: 'Ipera', 927: 'Iponya', 928: 'Mubunda', 929: 'Bendera', 930: 'Kilema Kusini', 931: 'Misha', 932: 'Rabuor', 933: 'Igamba', 934: 'Bassotu', 935: 'Pasua', 936: 'Uru Shimbwe', 937: 'Mkonze', 938: 'Mwakanshashala', 939: 'Ngokolo', 940: 'Ubetu Kahe', 941: 'Manyoni', 942: 'Kamsamba', 943: 'Kizengi', 944: 'Igwamanoni', 945: 'Maboga', 946: 'Mlowo', 947: 'Tutuo', 948: 'Kinyagiri', 949: 'Buhoro', 950: 'Siha Mashariki', 951: 'Msowero', 952: 'Ligoma', 953: 'Masakta', 954: 'Ukwama', 955: 'Nalasi', 956: 'Mbutu', 957: 'Makambako', 958: 'Karema', 959: 'Nyamatongo', 960: 'Butuguri', 961: 'Mbulu Urban', 962: 'Milambo', 963: 'Nkilizya', 964: 'Zongomera', 965: 'Pinyinyi', 966: 'Bangata', 967: 'Machochwe', 968: 'Ngonga', 969: 'Majengo', 970: 'Kisarawe II', 971: 'Ndama', 972: 'Luduga', 973: 'Wariku', 974: 'Katumba', 975: 'Mkundi', 976: 'Ugunga', 977: 'Mbuji', 978: "Rung'bure", 979: 'Old Moshi Mashariki', 980: 'Mlandizi', 981: 'Nyakasungwa', 982: 'Suma', 983: 'Longido', 984: 'Bwakila Chini', 985: 'Soni', 986: 'Ugogoni', 987: 'Ngana', 988: 'Igombavanu', 989: 'Partimbo', 990: 'Katerero', 991: 'Kahangara', 992: 'Ushiri Ikuini', 993: 'Bukanda', 994: 'Gelai Lumbwa', 995: 'Arri', 996: 'Bwiregi', 997: 'Kibati', 998: 'Lugunga', 999: 'Majeleko', 1000: 'Sazira', 1001: 'Igusule', 1002: 'Kibondo', 1003: 'Katoma', 1004: 'Maundo', 1005: 'Mtua', 1006: 'Mlali', 1007: 'Ipinda', 1008: 'Mbelekese', 1009: 'Kasololo', 1010: 'Ishunju', 1011: 'Rigicha', 1012: 'Myangayanga', 1013: 'Kandaga', 1014: 'Galula', 1015: 'Dalai', 1016: 'Bujashi', 1017: 'Luchili', 1018: 'Nyoni', 1019: 'Mihugi', 1020: 'Kala', 1021: 'Sokoni II', 1022: 'Uyumbu', 1023: 'Luilo', 1024: 'Mhonda', 1025: 'Ipuli', 1026: 'Chandama', 1027: 'Ulemo', 1028: 'Isale', 1029: 'Nyamagaro', 1030: 'Matomondo', 1031: 'Masama Rundugai', 1032: 'Lunguza', 1033: 'Kihagara', 1034: 'Lalago', 1035: 'Qash', 1036: 'Kinampanda', 1037: 'Bukongo', 1038: 'Loibor Siret', 1039: 'Kinyerezi', 1040: 'Ngoheranga', 1041: 'Ndono', 1042: 'Nyakibimbili', 1043: 'Mollo', 1044: 'Goweko', 1045: 'Segese', 1046: 'Kimbe', 1047: 'Kisumwa', 1048: 'Lalta', 1049: 'Segala', 1050: 'Bupamwa', 1051: 'Kingolwira', 1052: 'Ngujini', 1053: 'Mngonya', 1054: 'Kilima Rondo', 1055: 'Matare', 1056: 'Nangwa', 1057: 'Kilago', 1058: 'Ikongolo', 1059: 'Ukumbisiganga', 1060: 'Mtipa', 1061: 'Nanhyanga', 1062: 'Makanya', 1063: 'Mwazye', 1064: 'Saja', 1065: 'Kazaroho', 1066: 'Chikola', 1067: 'Nyalikungu', 1068: 'Msitu wa Tembo', 1069: 'Digodigo', 1070: 'Ilonga', 1071: 'Keni  Alleni', 1072: 'Ruvuma', 1073: 'Kigwe', 1074: 'Nkoaranga', 1075: 'Kwaluguru', 1076: 'Kasela', 1077: 'Kwagunda', 1078: 'Kwadelo', 1079: 'Iwawa', 1080: 'Mwera', 1081: 'Nyandoto', 1082: 'Kumsenga', 1083: 'Olmolog', 1084: 'Kichangani', 1085: 'Kishanje', 1086: 'Old Sumbawanga', 1087: 'Buhongwa', 1088: 'Kisawasawa', 1089: 'Chemba', 1090: 'Haidom', 1091: 'Rugu', 1092: 'Mpiruka', 1093: 'Chemchem', 1094: 'Kapilula', 1095: 'Ngomeni', 1096: 'Matogoro', 1097: "Ng'hoboko", 1098: 'Kigombe', 1099: 'Kisarawe', 1100: 'Masaba', 1101: 'Msimbu', 1102: 'Busongo', 1103: 'Kisaka', 1104: 'Ruhuwiko', 1105: 'Mkotokuyana', 1106: 'Shishani', 1107: 'Makongorosi', 1108: 'Terrat', 1109: 'Sasajila', 1110: 'Kapele', 1111: 'Igurubi', 1112: 'Kalamba', 1113: 'Engusero', 1114: 'Itunundu', 1115: 'Mtekente', 1116: 'Machame Kusini', 1117: 'Wala', 1118: 'Tlawi', 1119: 'Mugumu Urban', 1120: 'Ilembo', 1121: 'Kirima', 1122: 'Rugenge', 1123: 'Nzega Urban', 1124: 'Ikuwo', 1125: 'Mbamba bay', 1126: 'Mwabuzo', 1127: 'Magazini', 1128: 'Kingerikiti', 1129: 'Mvumi Makulu', 1130: 'Heru- Ushingo', 1131: 'Mkuranga', 1132: 'Gera', 1133: 'Mwongozo', 1134: 'Wino', 1135: 'Rudi', 1136: 'Uduka', 1137: 'Halungu', 1138: 'Kyambahi', 1139: 'Isevya', 1140: 'Kilangali', 1141: 'Kilomeni', 1142: 'Msogezi', 1143: 'Iwiji', 1144: 'Mngazi', 1145: 'Busole', 1146: 'Ngulinguli', 1147: 'Ndevelwa', 1148: 'Nyakato', 1149: 'Mlowa', 1150: 'Kipagalo', 1151: 'Mpapa', 1152: 'Vuje', 1153: 'Ijombe', 1154: 'Chongoleani', 1155: 'Iyunga mapinduzi', 1156: 'Sanjaranda', 1157: 'Usule', 1158: 'Nkoko', 1159: 'Haubi', 1160: 'Ruzinga', 1161: 'Igunga', 1162: 'Nsunga', 1163: 'Mamba Kaskazini', 1164: 'Ulanda', 1165: 'Esilalei', 1166: 'Lyoma', 1167: 'Udekwa', 1168: 'Mkange', 1169: 'Bereko', 1170: 'Iyogwe', 1171: 'Ukondamoyo', 1172: 'Motamburu Kitendani', 1173: 'Utende', 1174: 'Igombe Mkulu', 1175: 'Ndala', 1176: 'Ihalimba', 1177: 'Michenjele', 1178: 'Goba', 1179: 'Runzewe', 1180: 'Nkaiti', 1181: 'Kitahana', 1182: 'Buhigwe', 1183: 'Malya', 1184: 'Kwediboma', 1185: 'Itaba', 1186: 'Itiso', 1187: 'Nyehunge', 1188: 'Sepeko', 1189: 'Kaliua', 1190: 'Dongo', 1191: 'Kinole', 1192: 'Mpalanga', 1193: 'Roche', 1194: 'Sanzawa', 1195: 'Igogo', 1196: 'Buhemba', 1197: 'Bukima', 1198: 'Kyebitembe', 1199: 'Butimba', 1200: 'Busawe', 1201: 'Lizaboni', 1202: 'Chomvu', 1203: 'Mbondo', 1204: 'Idifu', 1205: 'Kikore', 1206: 'Likombe', 1207: 'Mtenga', 1208: 'Usisya', 1209: 'Kabwe', 1210: 'Muhukuru', 1211: 'Chilulumo', 1212: 'Chamkoroma', 1213: 'Daudi', 1214: 'Muhunga', 1215: 'Bonga', 1216: 'Mkonjowano', 1217: 'Kitomondo', 1218: 'Vigwaza', 1219: 'Tununguo', 1220: 'Isandula', 1221: 'Luana', 1222: 'Makutopora', 1223: 'Ludende', 1224: 'Ngorongoro', 1225: 'Tanga', 1226: 'Mbalamaziwa', 1227: 'Sunuka', 1228: 'Matanga', 1229: 'Makiba', 1230: 'Milingoti Mashariki', 1231: 'Chibelela', 1232: 'Nyabiyonza', 1233: 'Masanze', 1234: 'Kalebezo', 1235: 'Mshangano', 1236: 'Kanindo', 1237: 'Etaro', 1238: 'Bungu', 1239: 'Msamalo', 1240: 'Mbingu', 1241: 'Sigino', 1242: 'Chekelei', 1243: 'Bugwema', 1244: 'Nguruka', 1245: 'Iwungilo', 1246: 'Nampungu', 1247: 'Bunduki', 1248: 'Chingungwe', 1249: 'Ikuti', 1250: 'Mji Mpya', 1251: 'Mailimoja', 1252: 'Bukwe', 1253: "Makang'wa", 1254: 'Malindi', 1255: 'Kolo', 1256: 'Malambo', 1257: 'Itinje', 1258: 'Vunta', 1259: 'Nyasato', 1260: 'Mwamanongu', 1261: 'Rubuga', 1262: 'Turwa', 1263: 'Kidaru', 1264: 'Kaselya', 1265: 'Mateves', 1266: 'Kikombo', 1267: 'Nyathorogo', 1268: 'Ibihwa', 1269: 'Gehandu', 1270: 'Mabama', 1271: 'Ibiri', 1272: 'Malunga', 1273: 'Kwala', 1274: 'Kwafungo', 1275: 'Daraja Mbili', 1276: 'Mlola', 1277: 'Gairo', 1278: 'Manzase', 1279: 'Kemambo', 1280: 'Itobo', 1281: 'Chalangwa', 1282: 'Rudewa', 1283: 'Naipingo', 1284: 'Mtamaa', 1285: 'Katandala', 1286: 'Mtombozi', 1287: 'Shungubweni', 1288: 'Vigaeni', 1289: 'Nyamunga', 1290: 'Tamota', 1291: 'Suruke', 1292: 'Manga', 1293: 'Magindu', 1294: 'Kibosho Mashariki', 1295: 'Chona', 1296: 'Mcharo', 1297: 'Idamba', 1298: 'Kisangasa', 1299: 'Duru', 1300: 'Gidas', 1301: 'Burungura', 1302: 'Zuzu', 1303: 'Muleba', 1304: 'Ushokola', 1305: 'Vumilia', 1306: 'Mikoni', 1307: 'Bukabwa', 1308: 'Mkolani', 1309: 'Talawanda', 1310: 'Bwembera', 1311: 'Kilimilire', 1312: 'Lugarawa', 1313: 'Nakapanya', 1314: 'Mazae', 1315: 'Mdimba Mnyoma', 1316: 'Ukwega', 1317: 'Ngenge', 1318: 'Bahi', 1319: 'Engaruka', 1320: 'Mwangoye', 1321: 'Ngwala', 1322: 'Kinyala', 1323: 'Malampaka', 1324: 'Nyakakika', 1325: 'Tandahimba', 1326: 'Nyamwaga', 1327: 'Masieda', 1328: 'Soit Sambu', 1329: 'Jangalo', 1330: 'Ibwera', 1331: 'Mihambwe', 1332: 'Lituhi', 1333: 'Mafinga', 1334: 'Uyovu', 1335: 'Kibindu', 1336: 'Goima', 1337: 'Lumuma', 1338: 'Gitting', 1339: 'Sekei', 1340: 'Mkoka', 1341: 'Luswisi', 1342: 'Kwamndolwa', 1343: 'Kashaulili', 1344: 'Nyakahanga', 1345: 'Litumbandyosi', 1346: 'Kafule', 1347: 'Kanyigo', 1348: 'Uyui', 1349: 'Ngwelo', 1350: 'Misechela', 1351: 'Ulowa', 1352: 'Bugomora', 1353: 'Maguu', 1354: 'Mirare', 1355: 'Maretadu', 1356: 'Korogwe', 1357: 'Ufuluma', 1358: 'Kambikatoto', 1359: 'Kinamapula', 1360: 'Igurwa', 1361: 'Ngombezi', 1362: 'Namikupa', 1363: 'Idetemya', 1364: 'Ivuna', 1365: 'Chilionwa', 1366: 'Mwitikira', 1367: 'Rungwa', 1368: 'Lubiga', 1369: 'Marungu', 1370: 'Magiri', 1371: 'Kitongoni', 1372: 'Kifula', 1373: 'Chela', 1374: 'Chiwanda', 1375: 'Kighare', 1376: 'Ndogosi', 1377: 'Titye', 1378: "Nkomang'ombe", 1379: 'Sumbugu', 1380: 'Ifumbo', 1381: 'Iselamagazi', 1382: 'Iseni', 1383: 'Berega', 1384: 'Wampelembe', 1385: 'Ndedo', 1386: 'Nyahongo', 1387: 'Bulige', 1388: 'Nkinto', 1389: 'Itumba', 1390: 'Ilunde', 1391: 'Izigo', 1392: 'Musa', 1393: 'Engarenaibor', 1394: 'Langiro', 1395: 'Tegeruka', 1396: 'Mazimbu', 1397: 'Vuga', 1398: 'Mwamanga', 1399: 'Mabira', 1400: 'Milanzi', 1401: 'Ijumbi', 1402: 'Imalamakoye', 1403: 'Chirombola', 1404: 'Mnazi', 1405: 'Igandu', 1406: 'Isseke', 1407: 'Nduli', 1408: 'Ibugule', 1409: 'Karenge', 1410: 'Bungulwa', 1411: 'Haneti', 1412: 'Kasuga', 1413: 'Majiri', 1414: 'Ilindi', 1415: 'Mbaha', 1416: 'Kihangimahuka', 1417: 'Usinge', 1418: 'Nkoarisambu', 1419: "Kyang'ombe", 1420: 'Chaume', 1421: 'Issuwa', 1422: 'Nkiniziwa', 1423: 'Isagenhe', 1424: 'Mwanga Urban', 1425: 'Mhongolo', 1426: 'Mtego wa Noti', 1427: 'Yakobi', 1428: 'Igalala', 1429: 'Kibanga', 1430: 'Lumbiji', 1431: 'Sima', 1432: 'Mugunzu', 1433: 'Naputa', 1434: 'Lukumbule', 1435: 'Namilembe', 1436: 'Baleni', 1437: 'Ikowa', 1438: 'Malolo', 1439: 'Kipara Mnero', 1440: 'Itundu', 1441: 'Kishanda', 1442: 'Mlowa Bwawani', 1443: 'Namagondo', 1444: 'Kalunde', 1445: 'Nyakalilo', 1446: 'Usagali', 1447: 'Iringa Mvumi', 1448: 'Ziba', 1449: 'Rushwa', 1450: 'Mrijo', 1451: 'Mkulazi', 1452: 'Kirongo Samanga', 1453: 'Pangwi', 1454: 'Ubenazamozi', 1455: 'Bweni', 1456: 'Irisya', 1457: 'Kayanga', 1458: 'Dosidosi', 1459: 'Ushetu', 1460: 'Mkula', 1461: 'Shangani', 1462: 'Uhambingeto', 1463: "Mwang'halanga", 1464: 'Itonjanda', 1465: 'Kwasunga', 1466: 'Poli', 1467: 'Kilema kaskazini', 1468: 'Bumbuli', 1469: 'Bulyakashaju', 1470: 'Uwanja wa Ndege', 1471: 'Olbalbal', 1472: 'Kiloleni', 1473: 'Vigoi', 1474: 'Suguti', 1475: 'Vihingo', 1476: 'Zoissa', 1477: 'Mletele', 1478: 'Lyabukande', 1479: 'Misozwe', 1480: 'Chamazi', 1481: 'Sokon I', 1482: 'Mtii', 1483: 'Magamba', 1484: 'Malibwi', 1485: 'Madanga', 1486: 'Kagera Nkanda', 1487: 'Buyagu', 1488: 'Bangwe', 1489: 'Mvungwe', 1490: 'Sombetini', 1491: 'Iramba', 1492: 'Kabula', 1493: 'Ilujamate', 1494: 'Matemanga', 1495: 'Santilya', 1496: 'Ibumi', 1497: 'Makanda', 1498: 'Kasiriri', 1499: 'Ruponda', 1500: 'Utiri', 1501: 'Gumanga', 1502: 'Litisha', 1503: 'Sindeni', 1504: 'Mwabomba', 1505: 'Lionja', 1506: 'Ussoke', 1507: 'Semembela', 1508: 'Misalai', 1509: 'Vugiri', 1510: 'Machimboni', 1511: 'Kakunyu', 1512: 'Langali', 1513: 'Utwigu', 1514: 'Ishozi', 1515: 'Chikongola', 1516: 'Makame', 1517: 'Mpondo', 1518: 'Tongi', 1519: 'Kaagya', 1520: 'Subira', 1521: 'Gidahababieg', 1522: 'Rubafu', 1523: 'Nyamrandirira', 1524: 'Nsemulwa', 1525: 'Aghondi', 1526: 'Tchenzema', 1527: 'Bigwa', 1528: 'Mwakitolyo', 1529: 'Mitengo', 1530: 'Mgambo', 1531: "Mang'onyi", 1532: 'Kagoma', 1533: 'Ilagala', 1534: 'Misima', 1535: 'Mtindiro', 1536: 'Sitalike', 1537: 'Urwira', 1538: "Kining'inila", 1539: 'Katunguru', 1540: 'Msanja', 1541: 'Lolkisale', 1542: 'Bukundi', 1543: "Ng'ambo", 1544: "Ngh'ong'ona", 1545: 'Tulya', 1546: 'Makojo', 1547: 'Nansimo', 1548: 'Kiomoni', 1549: 'Loya', 1550: 'Puge', 1551: 'Kituntu', 1552: 'Mondo', 1553: 'Lukande', 1554: 'Kibasuka', 1555: 'Namatula', 1556: 'Lamaiti', 1557: 'Unga Ltd', 1558: 'Muhutwe', 1559: 'Bujugo', 1560: 'Mpendo', 1561: 'Kagunga', 1562: 'Engutoto', 1563: 'Kagondo', 1564: 'Kijima', 1565: 'Kizara', 1566: 'Bukumi', 1567: 'Kasense', 1568: 'Kibaigwa', 1569: 'Mofu', 1570: 'Mdago', 1571: 'Naiyobi', 1572: 'Tongoni', 1573: 'Nzega ndogo', 1574: 'Kyengege', 1575: 'Mambwekenya', 1576: 'Ngima', 1577: 'Kwamkonje', 1578: 'Shitage', 1579: 'Ngulu', 1580: 'Milepa', 1581: 'Serengeti National Park', 1582: 'Misughaa', 1583: 'Sirop', 1584: 'Chali', 1585: 'Kipeta', 1586: 'Shilalo', 1587: 'Nhundulu', 1588: 'Bugandika', 1589: 'Old Moshi Magharibi', 1590: 'Idodyandole', 1591: 'Imalaseko', 1592: 'Salama', 1593: 'Milingoti Magharibi', 1594: 'Mafinzi', 1595: 'Bukiko', 1596: 'Massa', 1597: 'Kanyelele', 1598: 'Magata/Karutanga', 1599: 'Buzilasoga', 1600: 'Elerai', 1601: 'Sanza', 1602: 'Mkumbi', 1603: 'Ngaya', 1604: 'Baraa', 1605: 'Mlalo', 1606: 'Luwalaje', 1607: 'Laghanga', 1608: 'Uyogo', 1609: 'Msangeni', 1610: 'Nyakatende', 1611: 'Kyanyari', 1612: 'Shigamba', 1613: 'Themi', 1614: 'Olbolot', 1615: 'Handali', 1616: 'Hirbadaw', 1617: 'Kibada', 1618: 'Tarekea Motamburu', 1619: 'Kimwani', 1620: 'Mgama', 1621: 'Matongo', 1622: 'Vijibweni', 1623: 'Bwakila Juu', 1624: 'Gongoni', 1625: 'Kasimbu', 1626: 'Pemba', 1627: 'Sale', 1628: 'Stesheni', 1629: 'Wela', 1630: 'Uchindile', 1631: 'Itilo', 1632: 'Kilakala', 1633: 'Kapalala', 1634: 'Ovada', 1635: 'Karanga', 1636: 'Masqaroda', 1637: 'Kitendaguru', 1638: 'Funta', 1639: 'Kirya', 1640: 'Izia', 1641: 'Kitura', 1642: 'Mwamashiga', 1643: 'Pangale', 1644: 'Naipanga', 1645: 'Ruvu Remit', 1646: 'Bassodesh', 1647: 'Mwawaza', 1648: 'Mpambala', 1649: 'Makorongo', 1650: 'Isakamaliwa', 1651: 'Olorien', 1652: 'Mundemu', 1653: 'Kiyowela', 1654: 'Bwanjai', 1655: 'Mlondwe', 1656: 'Urughu', 1657: 'Loibor Soit', 1658: 'Mwanga Kaskazini', 1659: 'Mpamantwa', 1660: 'Nghambaku', 1661: 'Kainam', 1662: 'Usevya', 1663: 'Longuo', 1664: 'Lugata', 1665: 'Sungwisi', 1666: 'Kwedizinga', 1667: 'Bukoko', 1668: 'Rusaba', 1669: 'Tingeni', 1670: 'Uyowa', 1671: 'Railway', 1672: 'Minyughe', 1673: 'Ukata', 1674: 'Changaa', 1675: 'Namapwia', 1676: 'Lutindi', 1677: 'Minziro', 1678: 'Mugango', 1679: 'Bwasi', 1680: 'Mkuzi', 1681: 'Isanzu', 1682: 'Tambuka Reli', 1683: 'Mtonya', 1684: 'Msalato', 1685: 'Bwagamoyo', 1686: 'Murongo', 1687: 'Ngarenaro', 1688: 'Kaloleni', 1689: 'Mwisi', 1690: 'Rau', 1691: 'Ndomoni', 1692: 'Membe', 1693: 'Kitembe', 1694: 'Kibaya', 1695: 'Kipili', 1696: 'Mbede', 1697: 'Mwese', 1698: 'Tambukareli', 1699: 'Negero', 1700: 'Mkwiti', 1701: 'Kiboriloni', 1702: 'Mwangoi', 1703: 'Mindu', 1704: 'Vinghawe', 1705: 'Bumera', 1706: 'Tunduma', 1707: 'Ijunganyondo', 1708: 'Ndembezi', 1709: 'Mkindi', 1710: 'Mwembesongo', 1711: 'Pangani Magharibi', 1712: 'Shanwe', 1713: 'Lemara', 1714: 'Zirai', 1715: 'Kiusa', 1716: 'Mhezi', 1717: 'Ichemba', 1718: 'Isaka', 1719: 'Chisano', 1720: 'Masukulu', 1721: 'Sigili', 1722: 'Kakesio', 1723: 'Lutende', 1724: 'Machinjioni', 1725: 'Kitangili', 1726: 'Ijanija', 1727: 'Itunduru', 1728: 'Bukindo', 1729: 'Goribe', 1730: 'Nyarokoba', 1731: 'Lupa tingatinga', 1732: 'Chifunfu', 1733: 'Lubiri', 1734: 'Kawajense', 1735: 'Mhande', 1736: 'Misunkumilo', 1737: 'Magoroto', 1738: 'Jaila', 1739: "Kimang'a", 1740: 'Kilulu', 1741: 'Matui', 1742: 'Kiluvya', 1743: 'Itandula', 1744: 'Kitete', 1745: 'Kitanga', 1746: 'Matwiga', 1747: 'Maskati', 1748: 'Kyamuraile', 1749: 'Oldonyo Sambu', 1750: 'Bukura', 1751: 'Kasingirima', 1752: 'Moita', 1753: 'Bondeni', 1754: 'Fufu', 1755: 'Ikweha', 1756: 'Murray', 1757: 'Mizibaziba', 1758: 'Busisi', 1759: 'Ifinga', 1760: 'Kimandolu', 1761: 'Boma', 1762: 'Simbay', 1763: "Ng'haya", 1764: 'Masasi', 1765: 'Bombambili', 1766: 'Matarawe', 1767: 'Chuno', 1768: 'Isuto', 1769: 'Thawi', 1770: 'Kikubiji', 1771: 'Budushi', 1772: 'Rusimbi', 1773: 'Nyanungu', 1774: 'Kirongo', 1775: 'Ugala', 1776: 'Mkwajuni', 1777: 'Mawenzi', 1778: 'Saunyi', 1779: 'Heka Azimio', 1780: 'Ilula', 1781: 'Mlimani', 1782: 'Bureza', 1783: 'Manyondwe', 1784: 'Korongoni', 1785: 'Mfaranyaki', 1786: 'Chinugulu', 1787: 'Nyamtinga', 1788: 'Kinungu'}
Mapeo inverso para 'scheme_management': {1: 'VWC', 2: 'Other', 3: 'WUG', 4: 'Water Board', 5: 'WUA', 6: 'Water authority', 7: 'Company', 8: 'Private operator', 9: 'Parastatal', 10: 'Trust', 11: 'SWC'}
Mapeo inverso para 'recorded_by': {1: 'GeoData Consultants Ltd'}
Mapeo inverso para 'scheme_name': {1: 'Roman', 2: 'K', 3: 'Rare_17_20', 4: 'Rare_3_7', 5: "wanging'ombe water supply s", 6: 'Rare_26_30', 7: 'Rare_31_35', 8: 'Rare_less_3', 9: 'Rare_41_45', 10: 'Rare_141_160', 11: 'Rare_21_25', 12: 'Chalinze wate', 13: 'Rare_101_120', 14: 'Ngana water supplied scheme', 15: 'Rare_13_16', 16: 'Rare_91_100', 17: 'Kirua kahe pumping water trust', 18: 'Rare_8_12', 19: 'Rare_56_64', 20: 'Machumba estate pipe line', 21: 'Rare_51_55', 22: 'Makwale water supplied sche', 23: 'Kirua kahe gravity water supply trust', 24: 'Rare_81_90', 25: 'Mkongoro One', 26: 'Maambreni gravity water supply', 27: 'M', 28: "wanging'ombe supply scheme", 29: 'Rare_36_40', 30: 'Rare_71_80', 31: 'Bagamoyo wate', 32: 'Kijiji', 33: 'Rare_161_180', 34: 'Losaa-Kia water supply', 35: 'N', 36: 'Rare_65_70', 37: 'Rare_46_50', 38: 'S', 39: 'Rare_181_200', 40: 'DANIDA', 41: 'Rare_121_140', 42: 'Borehole', 43: 'Mkongoro Two', 44: 'Government', 45: 'Nabaiye pipe line', 46: 'mtwango water supply scheme', 47: 'Lyamungo umbwe water supply', 48: "Uroki-Bomang'ombe water sup", 49: 'I', 50: 'Handeni Trunk Main(H', 51: 'Ki'}
Mapeo inverso para 'extraction_type': {1: 'gravity', 2: 'swn 80', 3: 'nira/tanira', 4: 'india mark ii', 5: 'other', 6: 'ksb', 7: 'submersible', 8: 'mono', 9: 'windmill', 10: 'afridev', 11: 'other - rope pump', 12: 'other - swn 81', 13: 'cemo', 14: 'climax', 15: 'other - play pump', 16: 'walimi', 17: 'india mark iii', 18: 'other - mkulima/shinyanga'}
Mapeo inverso para 'extraction_type_group': {1: 'gravity', 2: 'swn 80', 3: 'nira/tanira', 4: 'india mark ii', 5: 'other', 6: 'submersible', 7: 'mono', 8: 'wind-powered', 9: 'afridev', 10: 'rope pump', 11: 'other handpump', 12: 'other motorpump', 13: 'india mark iii'}
Mapeo inverso para 'extraction_type_class': {1: 'gravity', 2: 'handpump', 3: 'other', 4: 'submersible', 5: 'motorpump', 6: 'wind-powered', 7: 'rope pump'}
Mapeo inverso para 'management': {1: 'vwc', 2: 'wug', 3: 'other', 4: 'water board', 5: 'wua', 6: 'company', 7: 'private operator', 8: 'water authority', 9: 'parastatal', 10: 'unknown', 11: 'other - school', 12: 'trust'}
Mapeo inverso para 'management_group': {1: 'user-group', 2: 'other', 3: 'commercial', 4: 'parastatal', 5: 'unknown'}
Mapeo inverso para 'payment': {1: 'pay annually', 2: 'never pay', 3: 'pay per bucket', 4: 'unknown', 5: 'pay when scheme fails', 6: 'other', 7: 'pay monthly'}
Mapeo inverso para 'payment_type': {1: 'annually', 2: 'never pay', 3: 'per bucket', 4: 'unknown', 5: 'on failure', 6: 'other', 7: 'monthly'}
Mapeo inverso para 'water_quality': {1: 'soft', 2: 'milky', 3: 'salty', 4: 'unknown', 5: 'fluoride', 6: 'coloured', 7: 'salty abandoned', 8: 'fluoride abandoned'}
Mapeo inverso para 'quality_group': {1: 'good', 2: 'milky', 3: 'salty', 4: 'unknown', 5: 'fluoride', 6: 'colored'}
Mapeo inverso para 'quantity': {1: 'enough', 2: 'insufficient', 3: 'seasonal', 4: 'dry', 5: 'unknown'}
Mapeo inverso para 'quantity_group': {1: 'enough', 2: 'insufficient', 3: 'seasonal', 4: 'dry', 5: 'unknown'}
Mapeo inverso para 'source': {1: 'spring', 2: 'rainwater harvesting', 3: 'dam', 4: 'machine dbh', 5: 'shallow well', 6: 'river', 7: 'hand dtw', 8: 'lake', 9: 'unknown', 10: 'other'}
Mapeo inverso para 'source_type': {1: 'spring', 2: 'rainwater harvesting', 3: 'dam', 4: 'borehole', 5: 'shallow well', 6: 'river/lake', 7: 'other'}
Mapeo inverso para 'source_class': {1: 'groundwater', 2: 'surface', 3: 'unknown'}
Mapeo inverso para 'waterpoint_type': {1: 'communal standpipe', 2: 'communal standpipe multiple', 3: 'hand pump', 4: 'other', 5: 'improved spring', 6: 'cattle trough', 7: 'dam'}
Mapeo inverso para 'waterpoint_type_group': {1: 'communal standpipe', 2: 'hand pump', 3: 'other', 4: 'improved spring', 5: 'cattle trough', 6: 'dam'}
Mapeo inverso para 'status_group': {1: 'functional', 2: 'non functional', 3: 'functional needs repair'}
Mapeo inverso para 'population_category': {1: 'Medium population', 2: 'High population', 3: 'No population'}
Mapeo inverso para 'amount_tsh_category': {1: 'High water availability', 2: 'No water point', 3: 'Low water availability', 4: 'Medium water availability'}
Mapeo inverso para 'public_meeting': {1: True, 2: False}
Mapeo inverso para 'permit': {1: False, 2: True}
In [100]:
# Recodificar las etiquetas de 1, 2, 3 a 0, 1, 2
train_cleaned['status_group'] = train_cleaned['status_group'].map({1: 0, 2: 1, 3: 2})

# Verificar las etiquetas recodificadas
print("Clases únicas en status_group recodificado:", train_cleaned['status_group'].unique())
Clases únicas en status_group recodificado: [0 1 2]
In [101]:
train_cleaned
Out[101]:
id funder gps_height installer longitude latitude wpt_name num_private basin subvillage ... source_type source_class waterpoint_type waterpoint_type_group status_group day month year population_category amount_tsh_category
0 69572 1 1390 1 34.938093 -9.856322 1 0 1 1 ... 1 1 1 1 0 14 3 2011 1 1
1 8776 2 1399 2 34.698766 -2.147466 2 0 2 1 ... 2 2 1 1 0 6 3 2013 2 2
2 34310 3 686 3 37.460664 -3.821329 3 0 3 2 ... 3 2 2 1 0 25 2 2013 2 3
4 19728 3 0 4 31.130847 -1.825359 1 0 2 3 ... 2 2 1 1 0 13 7 2011 3 2
6 19816 4 0 3 33.362410 -3.766365 4 0 4 3 ... 4 1 3 2 1 1 10 2012 3 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
59395 60739 33 1210 17 37.169807 -3.253847 4 0 3 3 ... 1 1 1 1 0 3 5 2013 1 3
59396 27263 7 1212 8 35.249991 -9.070629 4 0 6 6 ... 6 2 1 1 0 7 5 2011 1 1
59397 37057 3 0 8 34.017087 -8.750434 3 0 6 3 ... 4 1 3 2 0 11 4 2011 3 2
59398 31282 3 0 8 35.861315 -6.378573 4 0 6 5 ... 5 1 3 2 0 8 3 2011 3 2
59399 26348 38 191 4 38.104048 -6.747464 4 0 7 4 ... 5 1 3 2 0 23 3 2011 1 2

51249 rows × 43 columns

In [102]:
list_correlacion = ['funder', 'gps_height', 'installer', 'longitude', 'latitude',
       'wpt_name', 'basin','population_category','district_code','public_meeting', 'scheme_management',
       'permit','construction_year','extraction_type','amount_tsh_category','management','payment','water_quality','quantity','source','waterpoint_type',
        'day','month','year','status_group']

df_correlacion = train_cleaned[list_correlacion]
correlation_mat = df_correlacion.corr()
sns.heatmap(correlation_mat)
plt.show()
No description has been provided for this image
In [103]:
# Lista de columnas que deseas conservar
# Existe correlacion entre variable debido a que hay distintas vairables que son lo mismo
columnas_deseadas = ['id','funder', 'gps_height', 'installer', 'longitude', 'latitude',
                     'wpt_name', 'basin', 'population_category', 'district_code', 'public_meeting', 
                     'scheme_management', 'permit', 'construction_year', 'extraction_type', 
                     'amount_tsh_category', 'management', 'payment', 'water_quality', 'quantity', 
                     'source', 'waterpoint_type', 'day', 'month', 'year', 'status_group']

# Seleccionar solo las columnas deseadas en tu DataFrame actual
train_cleaned = train_cleaned.loc[:, columnas_deseadas]

# Verificar que se hayan seleccionado correctamente las columnas
print(train_cleaned.head())

# También puedes guardar este DataFrame con las columnas seleccionadas si lo deseas
# train_cleaned_selected.to_csv('train_cleaned_selected.csv', index=False)
      id  funder  gps_height  installer  longitude  latitude  wpt_name  basin  \
0  69572       1        1390          1  34.938093 -9.856322         1      1   
1   8776       2        1399          2  34.698766 -2.147466         2      2   
2  34310       3         686          3  37.460664 -3.821329         3      3   
4  19728       3           0          4  31.130847 -1.825359         1      2   
6  19816       4           0          3  33.362410 -3.766365         4      4   

   population_category  district_code  ...  management  payment  \
0                    1              5  ...           1        1   
1                    2              2  ...           2        2   
2                    2              4  ...           1        3   
4                    3              1  ...           3        2   
6                    3              3  ...           1        2   

   water_quality  quantity  source  waterpoint_type  day  month  year  \
0              1         1       1                1   14      3  2011   
1              1         2       2                1    6      3  2013   
2              1         1       3                2   25      2  2013   
4              1         3       2                1   13      7  2011   
6              1         1       4                3    1     10  2012   

   status_group  
0             0  
1             0  
2             0  
4             0  
6             1  

[5 rows x 26 columns]
In [104]:
train_cleaned.dtypes
Out[104]:
id                       int64
funder                   int64
gps_height               int64
installer                int64
longitude              float64
latitude               float64
wpt_name                 int64
basin                    int64
population_category      int64
district_code            int64
public_meeting           int64
scheme_management        int64
permit                   int64
construction_year        int64
extraction_type          int64
amount_tsh_category      int64
management               int64
payment                  int64
water_quality            int64
quantity                 int64
source                   int64
waterpoint_type          int64
day                      int32
month                    int32
year                     int32
status_group             int64
dtype: object
In [105]:
train_cleaned.set_index('id', inplace=True)
In [106]:
corr = abs(train_cleaned.corr())
corr[['status_group']].sort_values(by = 'status_group',ascending = False).style.background_gradient()
Out[106]:
  status_group
status_group 1.000000
quantity 0.255018
construction_year 0.162461
waterpoint_type 0.098579
source 0.086977
gps_height 0.083267
longitude 0.082890
water_quality 0.069012
management 0.058448
public_meeting 0.056803
basin 0.053278
amount_tsh_category 0.052169
payment 0.045472
day 0.045155
extraction_type 0.044296
population_category 0.042843
wpt_name 0.041353
latitude 0.036566
district_code 0.033489
funder 0.032684
scheme_management 0.029733
installer 0.025804
year 0.022842
permit 0.013652
month 0.012924
In [107]:
def proporciones_final (var,target,df):

    proporcion = pd.DataFrame()
    
    proporcion['%fugas'] = df[target].groupby(df[var]).mean()*100
    proporcion['Conteo'] = df[target].groupby(df[var]).count()
    proporcion= proporcion.round(3)   
    proporcion_filtered = proporcion[(proporcion['%fugas']>0) & (proporcion['Conteo']>10)]  
        
    if len(proporcion_filtered)<100 and len(proporcion_filtered)>1:
        fig = plt.figure()
        ax = proporcion_filtered['Conteo'].plot(kind='bar',grid=True)
                
        ax2 = ax.twinx()
        ax2.plot(proporcion_filtered['%fugas'].values, linestyle='-', linewidth=2.0,color='g')
        plt.ylim(0, 100) # modificación.
        plt.tight_layout()   
        
    else:        
        proporcion_filtered.reset_index(inplace=True)
        sns.lmplot(x = var,y ='%fugas', data=proporcion_filtered,fit_reg=True,ci=None)
In [108]:
for i in train_cleaned.columns:
    proporciones_final (i, 'status_group', train_cleaned)
C:\Users\Fran\.conda\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
C:\Users\Fran\.conda\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
C:\Users\Fran\.conda\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
C:\Users\Fran\AppData\Local\Temp\ipykernel_18340\3214928767.py:11: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`.
  fig = plt.figure()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [109]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

def saca_metricas(y_real, y_pred_proba, classes):
    y_real_bin = label_binarize(y_real, classes=classes)
    n_classes = y_real_bin.shape[1]

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_real_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Calcular AUC promedio (macro)
    roc_auc_macro = roc_auc_score(y_real_bin, y_pred_proba, average='macro')
    print(f'- AUC (macro): {roc_auc_macro:.2f}')

    # Graficar las curvas ROC para cada clase
    plt.figure()
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i], label=f'Class {classes[i]} (AUC = {roc_auc[i]:.2f})')

    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Multiclass')
    plt.legend(loc='lower right')
    plt.show()

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    train_cleaned.drop('status_group', axis=1),
    train_cleaned.status_group,
    test_size=0.2,
    random_state=0,
    stratify=train_cleaned.status_group
)

# Entrenar y evaluar el modelo de árbol de decisión
tree_mod = DecisionTreeClassifier(criterion="gini").fit(X_train, y_train)
tree_pred_proba = tree_mod.predict_proba(X_test)
saca_metricas(y_test, tree_pred_proba, classes=[0, 1, 2])

# Entrenar y evaluar el modelo de bosque aleatorio
classifier2 = RandomForestClassifier().fit(X_train, y_train)
pred2_proba = classifier2.predict_proba(X_test)
saca_metricas(y_test, pred2_proba, classes=[0, 1, 2])

# Entrenar y evaluar el modelo LinearSVC
classifier3 = LinearSVC(penalty='l1', dual=False).fit(X_train, y_train)
pred3_decision = classifier3.decision_function(X_test)
from scipy.special import softmax
pred3_proba = softmax(pred3_decision, axis=1)
saca_metricas(y_test, pred3_proba, classes=[0, 1, 2])

# Entrenar y evaluar el modelo LogisticRegression
classifier4 = LogisticRegression(max_iter=10000).fit(X_train, y_train)
pred4_proba = classifier4.predict_proba(X_test)
saca_metricas(y_test, pred4_proba, classes=[0, 1, 2])
- AUC (macro): 0.73
No description has been provided for this image
- AUC (macro): 0.89
No description has been provided for this image
- AUC (macro): 0.72
C:\Users\Fran\.conda\Lib\site-packages\sklearn\svm\_base.py:1237: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
No description has been provided for this image
- AUC (macro): 0.73
C:\Users\Fran\.conda\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
No description has been provided for this image
In [110]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

# Define RandomForestClassifier
classifier2 = RandomForestClassifier()

# Realizar validación cruzada
cv = cross_val_score(
    classifier2,
    X_train, 
    y_train,
    scoring='roc_auc_ovo', 
    cv=5
)

print(cv)
print("CV ROC (media):", np.mean(cv))
print("CV ROC (desviación estándar):", np.std(cv))

# Ajustar classifier2 con todos los datos de entrenamiento
classifier2.fit(X_train, y_train)

# Calcular y mostrar las importancias de las características
imp = {}
for i in range(len(X_train.columns)):
    imp[X_train.columns[i]] = [classifier2.feature_importances_[i]]

# Mostrar las 10 características principales por importancia
top_features = pd.DataFrame.from_dict(imp, orient="index", columns=["Importancia"]).sort_values("Importancia", ascending=False).head(10)
top_features.style.background_gradient()
[0.87160865 0.87386408 0.85795465 0.86236074 0.87740752]
CV ROC (media): 0.868639129741956
CV ROC (desviación estándar): 0.007301769547195616
Out[110]:
  Importancia
quantity 0.128269
longitude 0.123040
latitude 0.119366
construction_year 0.076605
gps_height 0.074234
day 0.055811
waterpoint_type 0.054575
extraction_type 0.045595
funder 0.041175
installer 0.032926
In [111]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    train_cleaned.drop('status_group', axis=1),
    train_cleaned.status_group,
    test_size=0.2,
    random_state=0,
    stratify=train_cleaned.status_group
)

# Definir el modelo RandomForestClassifier y los parámetros de la búsqueda en cuadrícula
param_grid = {
    'n_estimators': [10, 50 ,100 ],
    'criterion': ['gini', 'entropy'],
    'max_depth': [16, 20],
    'max_features': ['log2', 'sqrt',None]
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

model_grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1
)

# Entrenar el modelo con GridSearchCV
model_grid.fit(X_train, y_train)

# Obtener los mejores parámetros y el mejor score
print("Mejores parámetros encontrados:")
print(model_grid.best_params_)
print("Mejor score encontrado (accuracy):")
print(model_grid.best_score_)

# Evaluar el modelo en el conjunto de prueba
tree_pred_proba = model_grid.predict_proba(X_test)

# Convertir las predicciones de probabilidad a etiquetas
tree_pred_labels = np.argmax(tree_pred_proba, axis=1)

# Mostrar métricas globales
accuracy = accuracy_score(y_test, tree_pred_labels)
precision = precision_score(y_test, tree_pred_labels, average='macro')
recall = recall_score(y_test, tree_pred_labels, average='macro')
f1 = f1_score(y_test, tree_pred_labels, average='macro')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Mostrar el informe de clasificación
print("Informe de clasificación:")
print(classification_report(y_test, tree_pred_labels))
Mejores parámetros encontrados:
{'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'n_estimators': 100}
Mejor score encontrado (accuracy):
0.8010439494405922
Accuracy: 0.80
Precision: 0.72
Recall: 0.64
F1-score: 0.66
Informe de clasificación:
              precision    recall  f1-score   support

           0       0.79      0.91      0.85      5682
           1       0.84      0.74      0.79      3860
           2       0.53      0.25      0.34       708

    accuracy                           0.80     10250
   macro avg       0.72      0.64      0.66     10250
weighted avg       0.79      0.80      0.79     10250

In [112]:
classifier2 = RandomForestClassifier(criterion = 'gini', n_estimators= 100, max_depth = 20, max_features= 'sqrt').fit(X_train, y_train)
pred2 = classifier2.predict(X_test)
In [113]:
print(classification_report(y_test,pred2))
              precision    recall  f1-score   support

           0       0.79      0.91      0.85      5682
           1       0.84      0.75      0.79      3860
           2       0.54      0.24      0.33       708

    accuracy                           0.80     10250
   macro avg       0.73      0.63      0.66     10250
weighted avg       0.80      0.80      0.79     10250

In [114]:
from imblearn.over_sampling import SMOTE

print("Before OverSampling, counts of label '2': {}".format(sum(train_cleaned.status_group==2)))
print("Before OverSampling, counts of label '1': {}".format(sum(train_cleaned.status_group==1)))
print("Before OverSampling, counts of label '0': {}".format(sum(train_cleaned.status_group==0)))

# Aplicar SMOTE para balancear las clases
sm = SMOTE(random_state=2)
train_cleaned_res, y_res = sm.fit_resample(train_cleaned.drop('status_group', axis=1), train_cleaned.status_group.ravel())

print('After OverSampling, the shape of train_X: {}'.format(train_cleaned_res.shape))
print('After OverSampling, the shape of train_y: {}'.format(y_res.shape))

print("After OverSampling, counts of label '2': {}".format(sum(y_res==2)))
print("After OverSampling, counts of label '1': {}".format(sum(y_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_res==0)))
Before OverSampling, counts of label '2': 3539
Before OverSampling, counts of label '1': 19298
Before OverSampling, counts of label '0': 28412
After OverSampling, the shape of train_X: (85236, 24)
After OverSampling, the shape of train_y: (85236,)
After OverSampling, counts of label '2': 28412
After OverSampling, counts of label '1': 28412
After OverSampling, counts of label '0': 28412
In [115]:
from sklearn.metrics import roc_curve, auc
import numpy as np

def saca_metricas(y_real, y_pred_proba, classes):
    n_classes = len(classes)
    y_real_bin = np.zeros((len(y_real), n_classes))
    for i, c in enumerate(classes):
        y_real_bin[:, i] = (y_real == c).astype(int)
    
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    
    # Compute ROC curve and ROC area for each class
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_real_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    # Compute macro-average ROC AUC
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    
    roc_auc_macro = auc(all_fpr, mean_tpr)
    
    # Print results
    for i in range(n_classes):
        print(f"Clase {classes[i]} - ROC AUC: {roc_auc[i]}")
    print(f"ROC AUC Promedio Macro: {roc_auc_macro}")
In [116]:
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split (train_cleaned_res, 
                                                     y_res,
                                                     test_size = 0.2,
                                                     random_state = 0,
                                                     stratify = y_res)
In [117]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

# Definición y entrenamiento del clasificador RandomForestClassifier
classifier5 = RandomForestClassifier(criterion='gini', n_estimators=500, max_depth=20, max_features='sqrt')
classifier5.fit(X_train_res, y_train_res)

# Validación cruzada para obtener predicciones de probabilidad
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
y_pred_cv = cross_val_predict(classifier5, train_cleaned_res, y_res, cv=skf, method='predict_proba')

# Predicciones en los datos de prueba
pred5 = classifier5.predict_proba(X_test_res)
pred_labels = np.argmax(pred5, axis=1) 
print("Classification Report en datos de prueba:")
print(classification_report(y_test_res, pred_labels))

# Evaluación usando la función saca_metricas en los datos de prueba
print("\nMétricas ROC AUC en datos de prueba:")
saca_metricas(y_test_res, pred5, classes=[0, 1, 2])
Classification Report en datos de prueba:
              precision    recall  f1-score   support

           0       0.84      0.83      0.83      5683
           1       0.87      0.80      0.83      5683
           2       0.86      0.95      0.90      5682

    accuracy                           0.86     17048
   macro avg       0.86      0.86      0.86     17048
weighted avg       0.86      0.86      0.86     17048


Métricas ROC AUC en datos de prueba:
Clase 0 - ROC AUC: 0.9512060073114998
Clase 1 - ROC AUC: 0.951165550438364
Clase 2 - ROC AUC: 0.9809235018785223
ROC AUC Promedio Macro: 0.961133692998321

Análisis comparativo:¶

  1. Accuracy, Precision, Recall y F1-score (macro):
  • Los valores mejoran significativamente después de aplicar SMOTE. Eindicaiere que el balanceo de clases ha ayudado al modelo a predecir mejor las clases minoritarias (como la clase 2 en este caso), lo que se refleja en métricas más equilibradas y altas.
  1. ROC AUC:
  • En el caso del ROC AUC, también vemos mejoras en los valores después del balanceo. Los valores altos de ROC AUC indican que el modelo es capaz de distinguir eficazmente entre las clases.
  1. Importancia del balanceo:
  • El balanceo de datos ha permitido al modelo aprender de manera más equilibrada de todas las clases, evitando así el sesgo hacia las clases mayoritarias que podría ocurrir en datos desbConclusinos:

Basándosue en estoodemosltados, puedes concluir que el balanceo de datos ha llevado a una mejora signifrendimientoel ldimiento de tu modelo RandomForestClassifier. Los resultados en términos de métricas de clasificación y ROC AUC son más altos y más equilibrados después de aplicar SMOTE, lo que indica que el modelo está mejorando en la capacidad de generalización y en la precisión de las predicciones para todas las clases.

In [119]:
classifier5.feature_importances_
Out[119]:
array([0.04462241, 0.06790051, 0.03649861, 0.11483131, 0.10315681,
       0.02593995, 0.02853173, 0.01682492, 0.03500414, 0.00612336,
       0.02111286, 0.01207471, 0.06351403, 0.05581014, 0.02083057,
       0.02072964, 0.03150336, 0.01112212, 0.10246199, 0.03618159,
       0.05032203, 0.05023259, 0.03249225, 0.01217838])
In [120]:
sorted_idx = classifier5.feature_importances_.argsort()
plt.barh(X_train_res.columns, classifier5.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
Out[120]:
Text(0.5, 0, 'Random Forest Feature Importance')
No description has been provided for this image
In [121]:
feat_importances = pd.Series(classifier5.feature_importances_, index=X_train_res.columns)
feat_importances.sort_values().plot(kind='barh')
Out[121]:
<Axes: >
No description has been provided for this image

Pre proceso de limpieza para la data test¶

Se repiten las mismas modificaciones

In [122]:
test_values
Out[122]:
id amount_tsh date_recorded funder gps_height installer longitude latitude wpt_name num_private ... payment_type water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group
0 50785 0.0 2013-02-04 Dmdd 1996 DMDD 35.290799 -4.059696 Dinamu Secondary School 0 ... never pay soft good seasonal seasonal rainwater harvesting rainwater harvesting surface other other
1 51630 0.0 2013-02-04 Government Of Tanzania 1569 DWE 36.656709 -3.309214 Kimnyak 0 ... never pay soft good insufficient insufficient spring spring groundwater communal standpipe communal standpipe
2 17168 0.0 2013-02-01 NaN 1567 NaN 34.767863 -5.004344 Puma Secondary 0 ... never pay soft good insufficient insufficient rainwater harvesting rainwater harvesting surface other other
3 45559 0.0 2013-01-22 Finn Water 267 FINN WATER 38.058046 -9.418672 Kwa Mzee Pange 0 ... unknown soft good dry dry shallow well shallow well groundwater other other
4 49871 500.0 2013-03-27 Bruder 1260 BRUDER 35.006123 -10.950412 Kwa Mzee Turuka 0 ... monthly soft good enough enough spring spring groundwater communal standpipe communal standpipe
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14845 39307 0.0 2011-02-24 Danida 34 Da 38.852669 -6.582841 Kwambwezi 0 ... never pay soft good enough enough river river/lake surface communal standpipe communal standpipe
14846 18990 1000.0 2011-03-21 Hiap 0 HIAP 37.451633 -5.350428 Bonde La Mkondoa 0 ... annually salty salty insufficient insufficient shallow well shallow well groundwater hand pump hand pump
14847 28749 0.0 2013-03-04 NaN 1476 NaN 34.739804 -4.585587 Bwawani 0 ... never pay soft good insufficient insufficient dam dam surface communal standpipe communal standpipe
14848 33492 0.0 2013-02-18 Germany 998 DWE 35.432732 -10.584159 Kwa John 0 ... never pay soft good insufficient insufficient river river/lake surface communal standpipe communal standpipe
14849 68707 0.0 2013-02-13 Government Of Tanzania 481 Government 34.765054 -11.226012 Kwa Mzee Chagala 0 ... never pay soft good dry dry spring spring groundwater communal standpipe communal standpipe

14850 rows × 40 columns

In [123]:
test_values.isnull().sum()
Out[123]:
id                          0
amount_tsh                  0
date_recorded               0
funder                    870
gps_height                  0
installer                 877
longitude                   0
latitude                    0
wpt_name                    0
num_private                 0
basin                       0
subvillage                 99
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting            821
recorded_by                 0
scheme_management         969
scheme_name              7242
permit                    737
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment                     0
payment_type                0
water_quality               0
quality_group               0
quantity                    0
quantity_group              0
source                      0
source_type                 0
source_class                0
waterpoint_type             0
waterpoint_type_group       0
dtype: int64
In [124]:
# Convertir la columna 'date_recorded' a datetime
test_values['date_recorded'] = pd.to_datetime(test_values['date_recorded'], format='%Y-%m-%d')

# Extraer día, mes y año en columnas separadas
test_values['day'] = test_values['date_recorded'].dt.day
test_values['month'] = test_values['date_recorded'].dt.month
test_values['year'] = test_values['date_recorded'].dt.year

# Eliminar la columna original 'date_recorded' si ya no es necesaria
test_values.drop(columns='date_recorded', inplace=True)
In [125]:
cat_cols= test_values.select_dtypes(include=['object','category']).columns
num_cols = test_values.select_dtypes(exclude=['object','category']).columns
In [126]:
import pandas as pd
import numpy as np

# Supongamos que `test_values` es tu DataFrame con la columna 'funder'

# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['funder'].dropna().count()
proportions = test_values['funder'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['funder'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for funder, count in imputed_nulls.items():
    impute_values.extend([funder] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'funder'
null_indices = test_values['funder'][test_values['funder'].isnull()].index
test_values.loc[null_indices, 'funder'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['funder'].value_counts())
Total de valores nulos: 870
Proporciones de cada categoría:
funder
Government Of Tanzania    0.158441
Danida                    0.056724
Hesawa                    0.041488
World Bank                0.025179
Kkkt                      0.024034
                            ...   
Craelius                  0.000072
Tanroad                   0.000072
Ndanda Missions           0.000072
Wards                     0.000072
Livin                     0.000072
Name: count, Length: 979, dtype: float64
Cantidad imputada a cada categoría:
funder
Government Of Tanzania    226
Danida                     49
Hesawa                     36
World Bank                 22
Kkkt                       21
                         ... 
Craelius                    0
Tanroad                     0
Ndanda Missions             0
Wards                       0
Livin                       0
Name: count, Length: 979, dtype: int32
Valores imputados:
funder
Government Of Tanzania    2441
Danida                     842
Hesawa                     616
World Bank                 374
Kkkt                       357
                          ... 
Craelius                     1
Tanroad                      1
Ndanda Missions              1
Wards                        1
Livin                        1
Name: count, Length: 979, dtype: int64
In [127]:
import pandas as pd
import numpy as np

# Supongamos que `test_values` es tu DataFrame original con la columna `funder`

# Calcular las frecuencias de las categorías
funder_counts = test_values['funder'].value_counts()

# Definir los tramos de frecuencia
rare_1_9 = funder_counts[funder_counts < 10].index
rare_10_49 = funder_counts[(funder_counts >= 10) & (funder_counts < 50)].index
rare_50_99 = funder_counts[(funder_counts >= 50) & (funder_counts < 100)].index

# Reemplazar las categorías según los tramos definidos
test_values['funder'] = test_values['funder'].apply(
    lambda x: 'Rare_1_9' if x in rare_1_9 else 
              ('Rare_10_49' if x in rare_10_49 else 
               ('Rare_50_99' if x in rare_50_99 else x))
)

# Mostrar los resultados
print(test_values['funder'].value_counts())
funder
Rare_10_49                2854
Government Of Tanzania    2441
Rare_50_99                1787
Rare_1_9                  1694
Danida                     842
Hesawa                     616
World Bank                 374
Kkkt                       357
Rwssp                      349
World Vision               336
Unicef                     284
Tasaf                      275
Dhv                        251
Private Individual         221
0                          216
Dwsp                       214
District Council           206
Norad                      195
Water                      166
Germany Republi            165
Ministry Of Water          147
Tcrs                       141
Hifab                      135
Netherlands                130
Dwe                        129
Lga                        110
Adb                        109
Amref                      106
Name: count, dtype: int64
In [128]:
import pandas as pd
import numpy as np


# Calcular las proporciones de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['installer'].dropna().count()
proportions = test_values['installer'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['installer'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for installer, count in imputed_nulls.items():
    impute_values.extend([installer] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'installer'
null_indices = test_values['installer'][test_values['installer'].isnull()].index
test_values.loc[null_indices, 'installer'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['installer'].value_counts())
Total de valores nulos: 877
Proporciones de cada categoría:
installer
DWE           0.311243
Government    0.032706
RWE           0.020897
Commu         0.020540
DANIDA        0.018249
                ...   
Commun        0.000072
QUIKWIN       0.000072
SDG           0.000072
MoWI          0.000072
LIVI          0.000072
Name: count, Length: 1091, dtype: float64
Cantidad imputada a cada categoría:
installer
DWE           352
Government     29
RWE            18
Commu          18
DANIDA         16
             ... 
Commun          0
QUIKWIN         0
SDG             0
MoWI            0
LIVI            0
Name: count, Length: 1091, dtype: int32
Valores imputados:
installer
DWE           4701
Government     486
RWE            310
Commu          305
DANIDA         271
              ... 
Commun           1
QUIKWIN          1
SDG              1
MoWI             1
LIVI             1
Name: count, Length: 1091, dtype: int64
In [129]:
import pandas as pd

# Supongamos que `test_values` es tu DataFrame original con la columna `installer`

# Calcular las frecuencias de las categorías
installer_counts = test_values['installer'].value_counts()

# Definir los tramos de frecuencia
rare_menor_5 = installer_counts[installer_counts < 5].index
rare_5_20 = installer_counts[(installer_counts >= 5) & (installer_counts <= 20)].index
rare_menor_50_mayor_20 = installer_counts[(installer_counts > 20) & (installer_counts < 50)].index
rare_50_100 = installer_counts[(installer_counts >= 50) & (installer_counts <= 100)].index
rare_101_150 = installer_counts[(installer_counts > 100) & (installer_counts <= 150)].index
rare_151_200 = installer_counts[(installer_counts > 150) & (installer_counts <= 200)].index
rare_201_500 = installer_counts[(installer_counts > 200) & (installer_counts <= 500)].index

# Reemplazar las categorías según los tramos definidos
test_values['installer'] = test_values['installer'].apply(
    lambda x: 'Rare_menor_5' if x in rare_menor_5 else 
              ('Rare_5_20' if x in rare_5_20 else 
               ('Rare_menor_50_mayor_20' if x in rare_menor_50_mayor_20 else 
                ('Rare_50_100' if x in rare_50_100 else 
                 ('Rare_101_150' if x in rare_101_150 else 
                  ('Rare_151_200' if x in rare_151_200 else 
                   ('Rare_201_500' if x in rare_201_500 else x))))))
)

# Mostrar los resultados
print(test_values['installer'].value_counts())
installer
DWE                       4701
Rare_201_500              2068
Rare_menor_50_mayor_20    2027
Rare_5_20                 1748
Rare_50_100               1434
Rare_menor_5              1262
Rare_101_150              1103
Rare_151_200               507
Name: count, dtype: int64
In [130]:
import pandas as pd
import numpy as np

# Supongamos que `test_values` es tu DataFrame original con la columna `subvillage`

# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['subvillage'].dropna().count()
proportions = test_values['subvillage'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['subvillage'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for subvillage, count in imputed_nulls.items():
    impute_values.extend([subvillage] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'subvillage'
null_indices = test_values['subvillage'][test_values['subvillage'].isnull()].index
test_values.loc[null_indices, 'subvillage'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['subvillage'].value_counts())
Total de valores nulos: 99
Proporciones de cada categoría:
subvillage
Shuleni           0.009491
Majengo           0.008745
Madukani          0.008203
Kati              0.006372
Sokoni            0.004203
                    ...   
Kurukangara       0.000068
Kiranjeranje A    0.000068
Mwajiji           0.000068
Ngabobo           0.000068
Kamba             0.000068
Name: count, Length: 8443, dtype: float64
Cantidad imputada a cada categoría:
subvillage
Shuleni           96
Majengo            1
Madukani           1
Kati               1
Sokoni             0
                  ..
Kurukangara        0
Kiranjeranje A     0
Mwajiji            0
Ngabobo            0
Kamba              0
Name: count, Length: 8443, dtype: int32
Valores imputados:
subvillage
Shuleni           236
Majengo           130
Madukani          122
Kati               95
Sokoni             62
                 ... 
Kurukangara         1
Kiranjeranje A      1
Mwajiji             1
Ngabobo             1
Kamba               1
Name: count, Length: 8443, dtype: int64
In [131]:
import pandas as pd
import numpy as np

# Supongamos que `test_values` es tu DataFrame original con la columna `subvillage`

# Calcular las frecuencias de las categorías
subvillage_counts = test_values['subvillage'].value_counts()

# Definir los tramos de frecuencia
rare_1 = subvillage_counts[subvillage_counts == 1].index
rare_2_4 = subvillage_counts[(subvillage_counts > 1) & (subvillage_counts < 5)].index
rare_5_10 = subvillage_counts[(subvillage_counts >= 5) & (subvillage_counts <= 10)].index
rare_11_20 = subvillage_counts[(subvillage_counts >= 11) & (subvillage_counts <= 20)].index
rare_21_50 = subvillage_counts[(subvillage_counts >= 21) & (subvillage_counts <= 50)].index
rare_51_100 = subvillage_counts[(subvillage_counts >= 51) & (subvillage_counts <= 100)].index
rare_101_plus = subvillage_counts[subvillage_counts > 100].index

# Reemplazar las categorías según los tramos definidos
test_values['subvillage'] = test_values['subvillage'].apply(
    lambda x: 'Rare_1' if x in rare_1 else 
              ('Rare_2_4' if x in rare_2_4 else 
               ('Rare_5_10' if x in rare_5_10 else 
                ('Rare_11_20' if x in rare_11_20 else 
                 ('Rare_21_50' if x in rare_21_50 else 
                  ('Rare_51_100' if x in rare_51_100 else 
                   ('Rare_101_plus' if x in rare_101_plus else x))))))
)

# Mostrar los resultados
print(test_values['subvillage'].value_counts())
subvillage
Rare_1           5964
Rare_2_4         5146
Rare_5_10        1596
Rare_11_20        734
Rare_21_50        649
Rare_101_plus     488
Rare_51_100       273
Name: count, dtype: int64
In [132]:
# Paso 1: Convertir "none" a NaN
test_values['wpt_name'] = test_values['wpt_name'].replace('none', np.nan)

# Paso 2: Contar valores nulos
total_nulls1 = test_values['wpt_name'].isnull().sum()
print(total_nulls1)
877
In [133]:
test_values
Out[133]:
id amount_tsh funder gps_height installer longitude latitude wpt_name num_private basin ... quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group day month year
0 50785 0.0 Rare_50_99 1996 Rare_menor_50_mayor_20 35.290799 -4.059696 Dinamu Secondary School 0 Internal ... seasonal seasonal rainwater harvesting rainwater harvesting surface other other 4 2 2013
1 51630 0.0 Government Of Tanzania 1569 DWE 36.656709 -3.309214 Kimnyak 0 Pangani ... insufficient insufficient spring spring groundwater communal standpipe communal standpipe 4 2 2013
2 17168 0.0 Government Of Tanzania 1567 DWE 34.767863 -5.004344 Puma Secondary 0 Internal ... insufficient insufficient rainwater harvesting rainwater harvesting surface other other 1 2 2013
3 45559 0.0 Rare_1_9 267 Rare_5_20 38.058046 -9.418672 Kwa Mzee Pange 0 Ruvuma / Southern Coast ... dry dry shallow well shallow well groundwater other other 22 1 2013
4 49871 500.0 Rare_10_49 1260 Rare_5_20 35.006123 -10.950412 Kwa Mzee Turuka 0 Ruvuma / Southern Coast ... enough enough spring spring groundwater communal standpipe communal standpipe 27 3 2013
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14845 39307 0.0 Danida 34 Rare_50_100 38.852669 -6.582841 Kwambwezi 0 Wami / Ruvu ... enough enough river river/lake surface communal standpipe communal standpipe 24 2 2011
14846 18990 1000.0 Rare_1_9 0 Rare_menor_5 37.451633 -5.350428 Bonde La Mkondoa 0 Pangani ... insufficient insufficient shallow well shallow well groundwater hand pump hand pump 21 3 2011
14847 28749 0.0 Rare_10_49 1476 Rare_5_20 34.739804 -4.585587 Bwawani 0 Internal ... insufficient insufficient dam dam surface communal standpipe communal standpipe 4 3 2013
14848 33492 0.0 Rare_10_49 998 DWE 35.432732 -10.584159 Kwa John 0 Lake Nyasa ... insufficient insufficient river river/lake surface communal standpipe communal standpipe 18 2 2013
14849 68707 0.0 Government Of Tanzania 481 Rare_201_500 34.765054 -11.226012 Kwa Mzee Chagala 0 Lake Nyasa ... dry dry spring spring groundwater communal standpipe communal standpipe 13 2 2013

14850 rows × 42 columns

In [134]:
import pandas as pd
import numpy as np

# Supongamos que `test_values` es tu DataFrame original con la columna `wpt_name`

# Calcular las proporciones de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['wpt_name'].dropna().count()
proportions = test_values['wpt_name'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['wpt_name'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for wpt_name, count in imputed_nulls.items():
    impute_values.extend([wpt_name] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'wpt_name'
null_indices = test_values['wpt_name'][test_values['wpt_name'].isnull()].index
test_values.loc[null_indices, 'wpt_name'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['wpt_name'].value_counts())
Total de valores nulos: 877
Proporciones de cada categoría:
wpt_name
Shuleni             0.031131
Zahanati            0.014600
Msikitini           0.008015
Sokoni              0.004867
Kanisani            0.004795
                      ...   
Kwa Gadau           0.000072
Bubinza B           0.000072
Nguvu Kazi A        0.000072
Kwa Everigist       0.000072
Kwa Mzee Chagala    0.000072
Name: count, Length: 10839, dtype: float64
Cantidad imputada a cada categoría:
wpt_name
Shuleni             765
Zahanati             13
Msikitini             7
Sokoni                4
Kanisani              4
                   ... 
Kwa Gadau             0
Bubinza B             0
Nguvu Kazi A          0
Kwa Everigist         0
Kwa Mzee Chagala      0
Name: count, Length: 10839, dtype: int32
Valores imputados:
wpt_name
Shuleni             1200
Zahanati             217
Msikitini            119
Sokoni                72
Kanisani              71
                    ... 
Kwa Gadau              1
Bubinza B              1
Nguvu Kazi A           1
Kwa Everigist          1
Kwa Mzee Chagala       1
Name: count, Length: 10839, dtype: int64
In [135]:
import pandas as pd

# Supongamos que `test_values` es tu DataFrame original con la columna `wpt_name`

# Obtener la distribución de la columna 'wpt_name'
wpt_name_counts = test_values['wpt_name'].value_counts()

# Definir los tramos de frecuencia
rare_1 = wpt_name_counts[wpt_name_counts == 1].index
rare_2_30 = wpt_name_counts[(wpt_name_counts > 1) & (wpt_name_counts <= 30)].index
rare_31_100 = wpt_name_counts[(wpt_name_counts > 30) & (wpt_name_counts <= 100)].index
rare_101_500 = wpt_name_counts[(wpt_name_counts > 100) & (wpt_name_counts <= 500)].index

# Reemplazar las categorías según los tramos definidos
test_values['wpt_name'] = test_values['wpt_name'].apply(
    lambda x: 'Rare_1' if x in rare_1 else
              ('Rare_2_30' if x in rare_2_30 else
               ('Rare_31_100' if x in rare_31_100 else
                ('Rare_101_500' if x in rare_101_500 else x))
    )
)

# Mostrar los resultados
print(test_values['wpt_name'].value_counts())
wpt_name
Rare_1          10046
Rare_2_30        2743
Shuleni          1200
Rare_31_100       525
Rare_101_500      336
Name: count, dtype: int64
In [136]:
# Obtener la distribución de la columna 'public_meeting'
public_meeting_counts = test_values['public_meeting'].value_counts()

print(public_meeting_counts)
public_meeting
True     12738
False     1291
Name: count, dtype: int64
In [137]:
# Obtener la distribución de la columna 'public_meeting'
public_meeting_counts = test_values['public_meeting'].value_counts()

# Crear una copia para evitar modificar los datos originales
public_meeting_counts_modified = public_meeting_counts.copy()

# Reemplazar True por 1 y False por 0 en la copia
public_meeting_counts_modified.index = public_meeting_counts_modified.index.map({True: 1, False: 0}.get)

# Mostrar los resultados
print(public_meeting_counts_modified)
public_meeting
1    12738
0     1291
Name: count, dtype: int64
In [138]:
# Contar valores nulos
total_nulls_public_meeting = test_values['public_meeting'].isnull().sum()
print(total_nulls_public_meeting)
821
In [139]:
# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['public_meeting'].dropna().count()
proportions = test_values['public_meeting'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['public_meeting'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for public_meeting, count in imputed_nulls.items():
    impute_values.extend([public_meeting] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'public_meeting'
null_indices = test_values['public_meeting'][test_values['public_meeting'].isnull()].index
test_values.loc[null_indices, 'public_meeting'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['public_meeting'].value_counts())
Total de valores nulos: 821
Proporciones de cada categoría:
public_meeting
True     0.907976
False    0.092024
Name: count, dtype: float64
Cantidad imputada a cada categoría:
public_meeting
True     745
False     76
Name: count, dtype: int32
Valores imputados:
public_meeting
True     13483
False     1367
Name: count, dtype: int64
In [140]:
import numpy as np

# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['scheme_management'].dropna().count()
proportions = test_values['scheme_management'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['scheme_management'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for scheme_management, count in imputed_nulls.items():
    impute_values.extend([scheme_management] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'scheme_management'
null_indices = test_values['scheme_management'][test_values['scheme_management'].isnull()].index
test_values.loc[null_indices, 'scheme_management'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['scheme_management'].value_counts())
Total de valores nulos: 969
Proporciones de cada categoría:
scheme_management
VWC                 0.657301
WUG                 0.092933
Water authority     0.059218
Water Board         0.051437
WUA                 0.048123
Parastatal          0.031986
Company             0.020171
Private operator    0.018947
Other               0.016569
SWC                 0.001873
Trust               0.001441
Name: count, dtype: float64
Cantidad imputada a cada categoría:
scheme_management
VWC                 637
WUG                  90
Water authority      57
Water Board          50
WUA                  47
Parastatal           31
Company              20
Private operator     18
Other                16
SWC                   2
Trust                 1
Name: count, dtype: int32
Valores imputados:
scheme_management
VWC                 9761
WUG                 1380
Water authority      879
Water Board          764
WUA                  715
Parastatal           475
Company              300
Private operator     281
Other                246
SWC                   28
Trust                 21
Name: count, dtype: int64
In [141]:
import numpy as np

# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['scheme_name'].dropna().count()
proportions = test_values['scheme_name'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['scheme_name'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for scheme_name, count in imputed_nulls.items():
    impute_values.extend([scheme_name] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'scheme_name'
null_indices = test_values['scheme_name'][test_values['scheme_name'].isnull()].index
test_values.loc[null_indices, 'scheme_name'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['scheme_name'].value_counts())
Total de valores nulos: 7242
Proporciones de cada categoría:
scheme_name
K                      0.023134
Borehole               0.020768
DANIDA                 0.013670
Chalinze wate          0.012618
M                      0.011830
                         ...   
Iseke water supply     0.000131
Dina                   0.000131
Water supplying        0.000131
Mazi                   0.000131
Mbwasa water supply    0.000131
Name: count, Length: 1788, dtype: float64
Cantidad imputada a cada categoría:
scheme_name
K                      168
Borehole               150
DANIDA                  99
Chalinze wate           91
M                       86
                      ... 
Iseke water supply       1
Dina                     1
Water supplying          1
Mazi                     1
Mbwasa water supply      1
Name: count, Length: 1788, dtype: int32
Valores imputados:
scheme_name
K                      344
Borehole               308
DANIDA                 203
Chalinze wate          187
M                      176
                      ... 
Iseke water supply       1
Dina                     1
Water supplying          1
Mazi                     1
Mbwasa water supply      1
Name: count, Length: 1788, dtype: int64
In [142]:
# Obtener la distribución de la columna 'scheme_name'
scheme_name_counts = test_values['scheme_name'].value_counts()

# Definir los tramos de frecuencia
rare_less_3 = scheme_name_counts[scheme_name_counts < 3].index
rare_3_7 = scheme_name_counts[(scheme_name_counts >= 3) & (scheme_name_counts <= 7)].index
rare_8_12 = scheme_name_counts[(scheme_name_counts >= 8) & (scheme_name_counts <= 12)].index
rare_13_16 = scheme_name_counts[(scheme_name_counts >= 13) & (scheme_name_counts <= 16)].index
rare_17_20 = scheme_name_counts[(scheme_name_counts >= 17) & (scheme_name_counts <= 20)].index
rare_21_25 = scheme_name_counts[(scheme_name_counts >= 21) & (scheme_name_counts <= 25)].index
rare_26_30 = scheme_name_counts[(scheme_name_counts >= 26) & (scheme_name_counts <= 30)].index
rare_31_35 = scheme_name_counts[(scheme_name_counts >= 31) & (scheme_name_counts <= 35)].index
rare_36_40 = scheme_name_counts[(scheme_name_counts >= 36) & (scheme_name_counts <= 40)].index
rare_41_45 = scheme_name_counts[(scheme_name_counts >= 41) & (scheme_name_counts <= 45)].index
rare_46_50 = scheme_name_counts[(scheme_name_counts >= 46) & (scheme_name_counts <= 50)].index
rare_51_55 = scheme_name_counts[(scheme_name_counts >= 51) & (scheme_name_counts <= 55)].index
rare_56_64 = scheme_name_counts[(scheme_name_counts >= 56) & (scheme_name_counts <= 64)].index
rare_65_70 = scheme_name_counts[(scheme_name_counts >= 65) & (scheme_name_counts <= 70)].index
rare_71_80 = scheme_name_counts[(scheme_name_counts >= 71) & (scheme_name_counts <= 80)].index
rare_81_90 = scheme_name_counts[(scheme_name_counts >= 81) & (scheme_name_counts <= 90)].index
rare_91_100 = scheme_name_counts[(scheme_name_counts >= 91) & (scheme_name_counts <= 100)].index
rare_101_120 = scheme_name_counts[(scheme_name_counts >= 101) & (scheme_name_counts <= 120)].index
rare_121_140 = scheme_name_counts[(scheme_name_counts >= 121) & (scheme_name_counts <= 140)].index
rare_141_160 = scheme_name_counts[(scheme_name_counts >= 141) & (scheme_name_counts <= 160)].index
rare_161_180 = scheme_name_counts[(scheme_name_counts >= 161) & (scheme_name_counts <= 180)].index
rare_181_200 = scheme_name_counts[(scheme_name_counts >= 181) & (scheme_name_counts <= 200)].index

# Reemplazar las categorías según los tramos definidos
test_values['scheme_name'] = test_values['scheme_name'].apply(lambda x: 
    'Rare_less_3' if x in rare_less_3 else
    'Rare_3_7' if x in rare_3_7 else
    'Rare_8_12' if x in rare_8_12 else
    'Rare_13_16' if x in rare_13_16 else
    'Rare_17_20' if x in rare_17_20 else
    'Rare_21_25' if x in rare_21_25 else
    'Rare_26_30' if x in rare_26_30 else
    'Rare_31_35' if x in rare_31_35 else
    'Rare_36_40' if x in rare_36_40 else
    'Rare_41_45' if x in rare_41_45 else
    'Rare_46_50' if x in rare_46_50 else
    'Rare_51_55' if x in rare_51_55 else
    'Rare_56_64' if x in rare_56_64 else
    'Rare_65_70' if x in rare_65_70 else
    'Rare_71_80' if x in rare_71_80 else
    'Rare_81_90' if x in rare_81_90 else
    'Rare_91_100' if x in rare_91_100 else
    'Rare_101_120' if x in rare_101_120 else
    'Rare_121_140' if x in rare_121_140 else
    'Rare_141_160' if x in rare_141_160 else
    'Rare_161_180' if x in rare_161_180 else
    'Rare_181_200' if x in rare_181_200 else
    x
)

# Mostrar los resultados
print(test_values['scheme_name'].value_counts())
scheme_name
Rare_8_12       3082
Rare_3_7        2556
Rare_13_16      1340
Rare_less_3     1149
Rare_21_25      1018
Rare_17_20       778
Rare_56_64       551
Rare_31_35       526
Rare_101_120     418
Rare_121_140     379
Rare_26_30       361
K                344
Borehole         308
Rare_36_40       304
Rare_41_45       260
Rare_81_90       256
DANIDA           203
Rare_181_200     187
Rare_161_180     176
Rare_141_160     146
Rare_65_70       132
Rare_51_55       106
Rare_91_100       98
Rare_46_50        94
Rare_71_80        78
Name: count, dtype: int64
In [143]:
# Obtener la distribución de la columna 'permit'
permit_counts = test_values['permit'].value_counts()

# Crear una copia para evitar modificar los datos originales
permit_counts_modified = permit_counts.copy()

# Reemplazar True por 1 y False por 0 en la copia
permit_counts_modified.index = permit_counts_modified.index.map({True: 1, False: 0}.get)

# Mostrar los resultados
print(permit_counts_modified)
permit
1    9754
0    4359
Name: count, dtype: int64
In [144]:
import numpy as np

# Calcular la proporción de cada categoría sobre el total de observaciones no nulas
total_non_null = test_values['permit'].dropna().count()
proportions = test_values['permit'].value_counts() / total_non_null

# Obtener la cantidad de valores nulos
total_nulls = test_values['permit'].isnull().sum()

# Multiplicar las proporciones por el total de valores nulos
imputed_nulls = np.round(proportions * total_nulls).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores nulos
imputed_nulls_delta = total_nulls - imputed_nulls.sum()
if imputed_nulls_delta > 0:
    first_non_zero_idx = imputed_nulls[imputed_nulls > 0].index[0]
    imputed_nulls[first_non_zero_idx] += imputed_nulls_delta

# Crear una lista con las categorías según las cantidades calculadas
impute_values = []
for permit, count in imputed_nulls.items():
    impute_values.extend([permit] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_nulls:
    impute_values = impute_values[:total_nulls]
elif len(impute_values) < total_nulls:
    additional_values = np.random.choice(proportions.index, size=total_nulls - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores nulos en la columna 'permit' de test_values
null_indices = test_values['permit'][test_values['permit'].isnull()].index
test_values.loc[null_indices, 'permit'] = impute_values

# Mostrar los resultados
print("Total de valores nulos:", total_nulls)
print("Proporciones de cada categoría:")
print(proportions)
print("Cantidad imputada a cada categoría:")
print(imputed_nulls)
print("Valores imputados:")
print(test_values['permit'].value_counts())
Total de valores nulos: 737
Proporciones de cada categoría:
permit
True     0.691136
False    0.308864
Name: count, dtype: float64
Cantidad imputada a cada categoría:
permit
True     509
False    228
Name: count, dtype: int32
Valores imputados:
permit
True     10263
False     4587
Name: count, dtype: int64
In [145]:
import pandas as pd
import numpy as np

# Calcular la proporción de cada año de construcción sobre el total de observaciones no nulas
total_non_zero = test_values['construction_year'].loc[test_values['construction_year'] != 0].count()
proportions = test_values['construction_year'].loc[test_values['construction_year'] != 0].value_counts() / total_non_zero

# Obtener la cantidad de valores 0
total_zeros = (test_values['construction_year'] == 0).sum()

# Multiplicar las proporciones por el total de valores 0
imputed_zeros = np.round(proportions * total_zeros).astype(int)

# Ajustar para asegurar que la suma sea igual al total de valores 0
imputed_zeros_delta = total_zeros - imputed_zeros.sum()
if imputed_zeros_delta > 0:
    first_non_zero_idx = imputed_zeros[imputed_zeros > 0].index[0]
    imputed_zeros[first_non_zero_idx] += imputed_zeros_delta

# Crear una lista con los años según las cantidades calculadas
impute_values = []
for year, count in imputed_zeros.items():
    impute_values.extend([year] * count)

# Si hay una discrepancia, ajustar la lista impute_values
if len(impute_values) > total_zeros:
    impute_values = impute_values[:total_zeros]
elif len(impute_values) < total_zeros:
    additional_values = np.random.choice(proportions.index, size=total_zeros - len(impute_values), p=proportions.values)
    impute_values.extend(additional_values)

# Reemplazar los valores 0 en la columna 'construction_year' de test_values
zero_indices = test_values['construction_year'][test_values['construction_year'] == 0].index
test_values.loc[zero_indices, 'construction_year'] = impute_values

# Mostrar los resultados
print("Total de valores 0:", total_zeros)
print("Proporciones de cada año:")
print(proportions)
print("Cantidad imputada a cada año:")
print(imputed_zeros)
print("Valores imputados:")
print(test_values['construction_year'].value_counts().sort_index())
Total de valores 0: 5260
Proporciones de cada año:
construction_year
2010    0.069760
2009    0.069135
2008    0.065693
2000    0.050782
2006    0.043900
2007    0.038895
2011    0.034932
2004    0.030657
2003    0.030553
1995    0.028050
2002    0.027946
2005    0.027529
2012    0.027424
1999    0.025339
1985    0.024192
1978    0.023983
1998    0.023358
1990    0.023149
1996    0.021794
1994    0.021064
1980    0.020229
1984    0.019917
1972    0.019187
1982    0.018978
1997    0.018457
1992    0.017414
2001    0.014599
1974    0.014390
1993    0.014286
1988    0.014181
1975    0.012930
1986    0.012409
1976    0.011575
1983    0.011053
1991    0.008655
1970    0.008551
1989    0.008342
1987    0.007091
1981    0.005527
1979    0.005527
1977    0.004692
1973    0.004484
2013    0.003441
1971    0.003337
1960    0.002294
1963    0.002294
1969    0.001877
1967    0.001877
1968    0.001668
1964    0.000834
1961    0.000730
1962    0.000626
1965    0.000209
1966    0.000209
Name: count, dtype: float64
Cantidad imputada a cada año:
construction_year
2010    367
2009    364
2008    346
2000    267
2006    231
2007    205
2011    184
2004    161
2003    161
1995    148
2002    147
2005    145
2012    144
1999    133
1985    127
1978    126
1998    123
1990    122
1996    115
1994    111
1980    106
1984    105
1972    101
1982    100
1997     97
1992     92
2001     77
1974     76
1993     75
1988     75
1975     68
1986     65
1976     61
1983     58
1991     46
1970     45
1989     44
1987     37
1981     29
1979     29
1977     25
1973     24
2013     18
1971     18
1960     12
1963     12
1969     10
1967     10
1968      9
1964      4
1961      4
1962      3
1965      1
1966      1
Name: count, dtype: int32
Valores imputados:
construction_year
1960      34
1961      11
1962       7
1963      34
1964      12
1965       2
1966       2
1967      28
1968      25
1969      28
1970     127
1971      50
1972     285
1973      67
1974     214
1975     192
1976     172
1977      70
1978     356
1979      82
1980     300
1981      82
1982     282
1983     164
1984     296
1985     359
1986     184
1987     105
1988     211
1989     124
1990     344
1991     129
1992     259
1993     212
1994     313
1995     417
1996     324
1997     274
1998     347
1999     376
2000     754
2001     217
2002     415
2003     454
2004     455
2005     409
2006     652
2007     578
2008     976
2009    1027
2010    1036
2011     519
2012     407
2013      51
Name: count, dtype: int64
In [146]:
import pandas as pd
import matplotlib.pyplot as plt

# Visualizar la distribución de population en test_values
plt.hist(test_values['population'], bins=30, color='skyblue', edgecolor='black')
plt.title('Histograma de population en test_values')
plt.xlabel('Población')
plt.ylabel('Frecuencia')
plt.show()

# Boxplot para visualizar outliers en test_values
plt.boxplot(test_values['population'], vert=False)
plt.title('Boxplot de population en test_values')
plt.xlabel('Población')
plt.show()

# Ver las estadísticas descriptivas de population en test_values
print(test_values['population'].describe())

# Definir los umbrales basados en cuantiles
low_threshold = test_values['population'].quantile(0.25)
medium_threshold = test_values['population'].quantile(0.75)

print(f'Low threshold: {low_threshold}')
print(f'Medium threshold: {medium_threshold}')

# Función para convertir la población en categorías
def categorize_population(pop):
    if pop == 0:
        return 'No population'
    elif pop <= low_threshold:
        return 'Low population'
    elif pop <= medium_threshold:
        return 'Medium population'
    else:
        return 'High population'

# Aplicar la función al DataFrame test_values
test_values['population_category'] = test_values['population'].apply(categorize_population)

# Verificar la nueva columna en test_values
print(test_values['population_category'].value_counts())

# Visualizar la distribución de la nueva variable categórica en test_values
test_values['population_category'].value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribución de categorías de población en test_values')
plt.xlabel('Categoría de población')
plt.ylabel('Frecuencia')
plt.show()
No description has been provided for this image
No description has been provided for this image
count    14850.000000
mean       184.114209
std        469.499332
min          0.000000
25%          0.000000
50%         20.000000
75%        220.000000
max      11469.000000
Name: population, dtype: float64
Low threshold: 0.0
Medium threshold: 220.0
population_category
Medium population    5694
No population        5453
High population      3703
Name: count, dtype: int64
No description has been provided for this image
In [147]:
test_values.drop('population', axis=1, inplace=True)
In [148]:
import pandas as pd

# Definir la función para categorizar amount_tsh
def categorize_amount_tsh(amount):
    if amount == 0:
        return 'No water point'
    elif amount <= 50:
        return 'Low water availability'
    elif amount <= 500:
        return 'Medium water availability'
    else:
        return 'High water availability'

# Aplicar la función al DataFrame test_values para crear una nueva columna categorizada
test_values['amount_tsh_category'] = test_values['amount_tsh'].apply(categorize_amount_tsh)

# Eliminar la columna original amount_tsh en test_values
test_values.drop('amount_tsh', axis=1, inplace=True)

# Verificar que la columna original haya sido eliminada en test_values
print(test_values.head())
      id                  funder  gps_height               installer  \
0  50785              Rare_50_99        1996  Rare_menor_50_mayor_20   
1  51630  Government Of Tanzania        1569                     DWE   
2  17168  Government Of Tanzania        1567                     DWE   
3  45559                Rare_1_9         267               Rare_5_20   
4  49871              Rare_10_49        1260               Rare_5_20   

   longitude   latitude wpt_name  num_private                    basin  \
0  35.290799  -4.059696   Rare_1            0                 Internal   
1  36.656709  -3.309214   Rare_1            0                  Pangani   
2  34.767863  -5.004344   Rare_1            0                 Internal   
3  38.058046  -9.418672   Rare_1            0  Ruvuma / Southern Coast   
4  35.006123 -10.950412   Rare_1            0  Ruvuma / Southern Coast   

  subvillage  ...                source           source_type  source_class  \
0   Rare_2_4  ...  rainwater harvesting  rainwater harvesting       surface   
1     Rare_1  ...                spring                spring   groundwater   
2     Rare_1  ...  rainwater harvesting  rainwater harvesting       surface   
3   Rare_2_4  ...          shallow well          shallow well   groundwater   
4   Rare_2_4  ...                spring                spring   groundwater   

      waterpoint_type waterpoint_type_group day month  year  \
0               other                 other   4     2  2013   
1  communal standpipe    communal standpipe   4     2  2013   
2               other                 other   1     2  2013   
3               other                 other  22     1  2013   
4  communal standpipe    communal standpipe  27     3  2013   

  population_category        amount_tsh_category  
0     High population             No water point  
1     High population             No water point  
2     High population             No water point  
3     High population             No water point  
4   Medium population  Medium water availability  

[5 rows x 42 columns]
In [149]:
test_values
Out[149]:
id funder gps_height installer longitude latitude wpt_name num_private basin subvillage ... source source_type source_class waterpoint_type waterpoint_type_group day month year population_category amount_tsh_category
0 50785 Rare_50_99 1996 Rare_menor_50_mayor_20 35.290799 -4.059696 Rare_1 0 Internal Rare_2_4 ... rainwater harvesting rainwater harvesting surface other other 4 2 2013 High population No water point
1 51630 Government Of Tanzania 1569 DWE 36.656709 -3.309214 Rare_1 0 Pangani Rare_1 ... spring spring groundwater communal standpipe communal standpipe 4 2 2013 High population No water point
2 17168 Government Of Tanzania 1567 DWE 34.767863 -5.004344 Rare_1 0 Internal Rare_1 ... rainwater harvesting rainwater harvesting surface other other 1 2 2013 High population No water point
3 45559 Rare_1_9 267 Rare_5_20 38.058046 -9.418672 Rare_1 0 Ruvuma / Southern Coast Rare_2_4 ... shallow well shallow well groundwater other other 22 1 2013 High population No water point
4 49871 Rare_10_49 1260 Rare_5_20 35.006123 -10.950412 Rare_1 0 Ruvuma / Southern Coast Rare_2_4 ... spring spring groundwater communal standpipe communal standpipe 27 3 2013 Medium population Medium water availability
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14845 39307 Danida 34 Rare_50_100 38.852669 -6.582841 Rare_1 0 Wami / Ruvu Rare_2_4 ... river river/lake surface communal standpipe communal standpipe 24 2 2011 Medium population No water point
14846 18990 Rare_1_9 0 Rare_menor_5 37.451633 -5.350428 Rare_1 0 Pangani Rare_2_4 ... shallow well shallow well groundwater hand pump hand pump 21 3 2011 High population High water availability
14847 28749 Rare_10_49 1476 Rare_5_20 34.739804 -4.585587 Rare_2_30 0 Internal Rare_5_10 ... dam dam surface communal standpipe communal standpipe 4 3 2013 Medium population No water point
14848 33492 Rare_10_49 998 DWE 35.432732 -10.584159 Rare_2_30 0 Lake Nyasa Rare_2_4 ... river river/lake surface communal standpipe communal standpipe 18 2 2013 Medium population No water point
14849 68707 Government Of Tanzania 481 Rare_201_500 34.765054 -11.226012 Rare_1 0 Lake Nyasa Rare_1 ... spring spring groundwater communal standpipe communal standpipe 13 2 2013 Medium population No water point

14850 rows × 42 columns

In [150]:
variables_dummies = ['funder','installer','basin','wpt_name','basin','subvillage','region','lga','ward','scheme_management','recorded_by','scheme_name','extraction_type','extraction_type_group','extraction_type_class','management','management_group','payment','payment_type','water_quality','quality_group','quantity','quantity_group','source','source_type','source_class','waterpoint_type','waterpoint_type_group','status_group','population_category','amount_tsh_category','public_meeting','permit']
In [151]:
import pickle
import pandas as pd


# Nombre del archivo donde se guardó el diccionario mapeo_inverso
nombre_archivo = 'mapeo_inverso.pkl'

# Lista de variables categóricas donde aplicar el mapeo inverso
variables_categoricas = ['funder', 'installer', 'basin', 'wpt_name', 'subvillage', 'region', 
                         'lga', 'ward', 'scheme_management', 'recorded_by', 'scheme_name',
                         'extraction_type', 'extraction_type_group', 'extraction_type_class',
                         'management', 'management_group', 'payment', 'payment_type',
                         'water_quality', 'quality_group', 'quantity', 'quantity_group',
                         'source', 'source_type', 'source_class', 'waterpoint_type',
                         'waterpoint_type_group', 'population_category', 'amount_tsh_category',
                         'public_meeting', 'permit']

# Iterar sobre las variables categóricas y aplicar el mapeo inverso para test_values
for var in variables_categoricas:
    # Cargar el diccionario mapeo_inverso desde el archivo guardado
    with open(nombre_archivo, 'rb') as f:
        mapeo_inverso_train = pickle.load(f)
    
    # Aplicar el mapeo inverso para test_values
    if var in mapeo_inverso_train:
        mapeo = mapeo_inverso_train[var]
        mapeo_nan = {v: k for k, v in mapeo.items()}  # Mapeo con NaN manejados
        test_values.loc[:, var] = test_values[var].map(mapeo_nan).fillna('Desconocido')  # Asignar 'Desconocido' a NaN
    else:
        print(f"El diccionario mapeo_inverso no contiene la clave '{var}'")

# Verificar el resultado para test_values después de aplicar el mapeo inverso
print(test_values.head())
      id funder  gps_height installer  longitude   latitude wpt_name  \
0  50785      2        1996         2  35.290799  -4.059696        4   
1  51630     16        1569         5  36.656709  -3.309214        4   
2  17168     16        1567         5  34.767863  -5.004344        4   
3  45559      3         267         8  38.058046  -9.418672        4   
4  49871      7        1260         8  35.006123 -10.950412        4   

   num_private basin subvillage  ... source  source_type  source_class  \
0            0     4          3  ...      2            2             2   
1            0     3          4  ...      1            1             1   
2            0     4          4  ...      2            2             2   
3            0     9          3  ...      5            5             1   
4            0     9          3  ...      1            1             1   

  waterpoint_type waterpoint_type_group day month  year population_category  \
0               4                     3   4     2  2013                   2   
1               1                     1   4     2  2013                   2   
2               4                     3   1     2  2013                   2   
3               4                     3  22     1  2013                   2   
4               1                     1  27     3  2013                   1   

  amount_tsh_category  
0                   2  
1                   2  
2                   2  
3                   2  
4                   4  

[5 rows x 42 columns]
In [152]:
test_values.set_index('id', inplace=True)
In [153]:
# Lista de columnas que deseas conservar
columnas_deseadas = ['funder', 'gps_height', 'installer', 'longitude', 'latitude',
                     'wpt_name', 'basin', 'population_category', 'district_code', 'public_meeting', 
                     'scheme_management', 'permit', 'construction_year', 'extraction_type', 
                     'amount_tsh_category', 'management', 'payment', 'water_quality', 'quantity', 
                     'source', 'waterpoint_type', 'day', 'month', 'year']

# Seleccionar solo las columnas deseadas en tu DataFrame test_values
test_values = test_values.loc[:, columnas_deseadas]

# Verificar que se hayan seleccionado correctamente las columnas
print(test_values.head())
      funder  gps_height installer  longitude   latitude wpt_name basin  \
id                                                                        
50785      2        1996         2  35.290799  -4.059696        4     4   
51630     16        1569         5  36.656709  -3.309214        4     3   
17168     16        1567         5  34.767863  -5.004344        4     4   
45559      3         267         8  38.058046  -9.418672        4     9   
49871      7        1260         8  35.006123 -10.950412        4     9   

      population_category  district_code public_meeting  ...  \
id                                                       ...   
50785                   2              3              1  ...   
51630                   2              2              1  ...   
17168                   2              2              1  ...   
45559                   2             43              1  ...   
49871                   1              3              1  ...   

      amount_tsh_category management  payment water_quality quantity source  \
id                                                                            
50785                   2          9        2             1        3      2   
51630                   2          1        2             1        2      1   
17168                   2          1        2             1        2      2   
45559                   2          1        4             1        4      5   
49871                   4          4        7             1        1      1   

      waterpoint_type day month  year  
id                                     
50785               4   4     2  2013  
51630               1   4     2  2013  
17168               4   1     2  2013  
45559               4  22     1  2013  
49871               1  27     3  2013  

[5 rows x 24 columns]
In [154]:
test_values.transpose()
Out[154]:
id 50785 51630 17168 45559 49871 52449 24806 28965 36301 54122 ... 59757 64579 57731 65541 68174 39307 18990 28749 33492 68707
funder 2 16 16 3 7 16 16 2 12 2 ... 7 4 2 2 34 8 3 7 7 16
gps_height 1996 1569 1567 267 1260 1685 550 234 584 1083 ... 1291 0 808 1641 0 34 0 1476 998 481
installer 2 5 5 8 8 5 4 1 1 1 ... 1 5 5 2 5 1 15 8 5 3
longitude 35.290799 36.656709 34.767863 38.058046 35.006123 36.685279 36.398041 39.60742 39.262951 37.096108 ... 35.345384 0.0 29.740224 29.768139 34.096878 38.852669 37.451633 34.739804 35.432732 34.765054
latitude -4.059696 -3.309214 -5.004344 -9.418672 -10.950412 -3.30242 -7.541382 -10.893786 -10.823588 -3.251754 ... -9.83117 -0.0 -4.882705 -4.480618 -3.079689 -6.582841 -5.350428 -4.585587 -10.584159 -11.226012
wpt_name 4 4 4 4 4 4 1 4 4 4 ... 4 4 3 4 4 4 4 3 3 4
basin 4 3 4 9 9 3 6 9 9 3 ... 1 2 5 5 2 7 3 4 1 1
population_category 2 2 2 2 1 1 2 1 1 1 ... 3 3 2 2 3 1 2 1 1 1
district_code 3 2 2 43 3 2 7 4 33 7 ... 2 1 3 2 2 1 7 2 2 3
public_meeting 1 1 1 1 1 1 1 1 1 1 ... 1 2 1 1 1 1 1 1 1 1
scheme_management 9 1 1 1 4 1 1 4 1 4 ... 1 3 3 6 3 1 1 1 1 1
permit 2 2 2 2 2 2 2 2 1 2 ... 2 1 2 1 1 2 1 1 2 2
construction_year 2012 2000 2010 1987 2000 1990 2007 1982 1997 2003 ... 2009 1961 2009 1995 1962 1988 1994 2010 2009 2008
extraction_type 5 1 5 5 1 1 4 7 1 1 ... 1 2 1 5 3 8 3 1 1 1
amount_tsh_category 2 2 2 2 4 2 2 2 3 2 ... 2 2 1 2 2 2 1 2 2 2
management 9 1 1 1 4 1 1 1 1 4 ... 1 2 1 1 2 1 1 1 1 1
payment 2 2 2 4 7 2 2 2 3 7 ... 2 4 7 2 6 2 1 2 2 2
water_quality 1 1 1 1 1 1 3 1 1 1 ... 1 1 1 1 1 1 3 1 1 1
quantity 3 2 2 4 1 1 1 4 2 1 ... 1 1 1 1 1 1 2 2 2 4
source 2 1 2 5 1 1 4 4 1 1 ... 6 5 6 1 5 6 5 3 6 1
waterpoint_type 4 1 4 4 1 1 3 2 1 1 ... 1 3 2 4 3 1 3 1 1 1
day 4 4 1 22 27 4 2 25 23 18 ... 24 26 27 4 7 24 21 4 18 13
month 2 2 2 1 3 3 3 1 1 3 ... 2 10 1 2 11 2 3 3 2 2
year 2013 2013 2013 2013 2013 2013 2011 2013 2013 2013 ... 2013 2012 2013 2013 2012 2011 2011 2013 2013 2013

24 rows × 14850 columns

In [155]:
# Hacer predicciones usando el modelo entrenado
predictions = classifier5.predict_proba(test_values)
In [156]:
predictions
Out[156]:
array([[0.42632192, 0.47347173, 0.10020635],
       [0.57655141, 0.20815819, 0.2152904 ],
       [0.6675771 , 0.25196104, 0.08046186],
       ...,
       [0.36171236, 0.17498557, 0.46330207],
       [0.61131091, 0.10699484, 0.28169425],
       [0.01888385, 0.97703282, 0.00408333]])
In [157]:
from pycaret.classification import setup, predict_model

# Configura el entorno y prepara los datos para el modelo
setup(data=train_cleaned, target='status_group')

# Realiza las predicciones usando el modelo entrenado
predictions = predict_model(classifier5, data=test_values)
  Description Value
0 Session id 984
1 Target status_group
2 Target type Multiclass
3 Original data shape (51249, 25)
4 Transformed data shape (51249, 25)
5 Transformed train set shape (35874, 25)
6 Transformed test set shape (15375, 25)
7 Numeric features 24
8 Preprocess True
9 Imputation type simple
10 Numeric imputation mean
11 Categorical imputation mode
12 Fold Generator StratifiedKFold
13 Fold Number 10
14 CPU Jobs -1
15 Use GPU False
16 Log Experiment False
17 Experiment Name clf-default-name
18 USI c550
In [158]:
predictions
Out[158]:
funder gps_height installer longitude latitude wpt_name basin population_category district_code public_meeting ... payment water_quality quantity source waterpoint_type day month year prediction_label prediction_score
id
50785 2 1996 2 35.290798 -4.059696 4 4 2 3 1 ... 2 1 3 2 4 4 2 2013 1 0.4735
51630 16 1569 5 36.656708 -3.309214 4 3 2 2 1 ... 2 1 2 1 1 4 2 2013 0 0.5766
17168 16 1567 5 34.767864 -5.004344 4 4 2 2 1 ... 2 1 2 2 4 1 2 2013 0 0.6676
45559 3 267 8 38.058044 -9.418673 4 9 2 43 1 ... 4 1 4 5 4 22 1 2013 1 0.8979
49871 7 1260 8 35.006123 -10.950412 4 9 1 3 1 ... 7 1 1 1 1 27 3 2013 0 0.9259
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39307 8 34 1 38.852669 -6.582841 4 7 1 1 1 ... 2 1 1 6 1 24 2 2011 1 0.9181
18990 3 0 15 37.451633 -5.350428 4 3 2 7 1 ... 1 3 2 5 3 21 3 2011 0 0.6254
28749 7 1476 8 34.739803 -4.585587 3 4 1 2 1 ... 2 1 2 3 1 4 3 2013 2 0.4633
33492 7 998 5 35.432732 -10.584159 3 1 1 2 1 ... 2 1 2 6 1 18 2 2013 0 0.6113
68707 16 481 3 34.765053 -11.226012 4 1 1 3 1 ... 2 1 4 1 1 13 2 2013 1 0.9770

14850 rows × 26 columns

In [159]:
predictions.rename(columns={'prediction_label': 'status_group'}, inplace=True)
In [160]:
# Resetear el índice para que 'id' sea una columna normal
predictions.reset_index(inplace=True)

# Renombrar la columna de predicción si no se ha hecho ya
predictions.rename(columns={'prediction_label': 'status_group'}, inplace=True)

# Definir el mapeo de valores
status_mapping = {
    0: 'functional',  
    1: 'non functional',
    2: 'functional needs repair'
}

# Convertir la columna 'status_group' a int antes de aplicar el mapeo
predictions['status_group'] = predictions['status_group'].astype(int)

# Aplicar el mapeo a la columna 'status_group'
predictions['status_group'] = predictions['status_group'].replace(status_mapping)

# Convertir la columna 'id' a string
predictions['id'] = predictions['id'].astype(str)

# Guardar las columnas 'id' y 'status_group' en un archivo CSV
predictions[['id', 'status_group']].to_csv('submission.csv', index=False)

# Verificar los cambios
print(predictions[['id', 'status_group']].head())
      id    status_group
0  50785  non functional
1  51630      functional
2  17168      functional
3  45559  non functional
4  49871      functional
In [161]:
predictions[['id', 'status_group']]
Out[161]:
id status_group
0 50785 non functional
1 51630 functional
2 17168 functional
3 45559 non functional
4 49871 functional
... ... ...
14845 39307 non functional
14846 18990 functional
14847 28749 functional needs repair
14848 33492 functional
14849 68707 non functional

14850 rows × 2 columns

In [162]:
from IPython.display import FileLink
FileLink('submission.csv')
Out[162]:
submission.csv
In [163]:
selected_features = ['population_category', 'amount_tsh_category', 'gps_height', 'funder', 'installer',
                     'year','day','month', 'scheme_management', 'construction_year', 'extraction_type',
                     'management', 'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type', 'status_group']

train_data_selected = train_cleaned[selected_features]

clf_setup = setup(data=train_data_selected, target='status_group', train_size=0.8)
  Description Value
0 Session id 8877
1 Target status_group
2 Target type Multiclass
3 Original data shape (51249, 18)
4 Transformed data shape (51249, 18)
5 Transformed train set shape (40999, 18)
6 Transformed test set shape (10250, 18)
7 Numeric features 17
8 Preprocess True
9 Imputation type simple
10 Numeric imputation mean
11 Categorical imputation mode
12 Fold Generator StratifiedKFold
13 Fold Number 10
14 CPU Jobs -1
15 Use GPU False
16 Log Experiment False
17 Experiment Name clf-default-name
18 USI 314f
In [164]:
from pycaret.classification import setup, compare_models, predict_model

best_model = compare_models()
  Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
xgboost Extreme Gradient Boosting 0.7860 0.8857 0.7860 0.7811 0.7716 0.5839 0.5940 0.1890
rf Random Forest Classifier 0.7822 0.8828 0.7822 0.7734 0.7749 0.5878 0.5907 0.3550
lightgbm Light Gradient Boosting Machine 0.7772 0.8781 0.7772 0.7785 0.7598 0.5616 0.5779 0.3110
et Extra Trees Classifier 0.7758 0.8613 0.7758 0.7684 0.7708 0.5801 0.5814 0.3130
gbc Gradient Boosting Classifier 0.7459 0.0000 0.7459 0.7511 0.7192 0.4901 0.5160 1.1390
dt Decision Tree Classifier 0.7257 0.7559 0.7257 0.7261 0.7258 0.4981 0.4982 0.0320
ada Ada Boost Classifier 0.7233 0.0000 0.7233 0.7109 0.6925 0.4434 0.4669 0.1230
qda Quadratic Discriminant Analysis 0.6645 0.0000 0.6645 0.6482 0.6534 0.3609 0.3632 0.0150
lda Linear Discriminant Analysis 0.6629 0.0000 0.6629 0.6174 0.6293 0.3209 0.3374 0.0190
ridge Ridge Classifier 0.6621 0.0000 0.6621 0.6164 0.6287 0.3198 0.3358 0.0140
lr Logistic Regression 0.6608 0.0000 0.6608 0.6138 0.6289 0.3202 0.3335 0.6980
knn K Neighbors Classifier 0.6584 0.7353 0.6584 0.6424 0.6397 0.3318 0.3392 0.2750
nb Naive Bayes 0.6475 0.7304 0.6475 0.6104 0.6230 0.3117 0.3158 0.0140
svm SVM - Linear Kernel 0.5546 0.0000 0.5546 0.6331 0.4892 0.1777 0.2201 0.4660
dummy Dummy Classifier 0.5544 0.5000 0.5544 0.3074 0.3955 0.0000 0.0000 0.0110
In [165]:
from pycaret.classification import finalize_model

final_model = finalize_model(best_model)
In [166]:
predictions = predict_model(final_model, data=test_values)
In [167]:
predictions
Out[167]:
funder gps_height installer longitude latitude wpt_name basin population_category district_code public_meeting ... payment water_quality quantity source waterpoint_type day month year prediction_label prediction_score
id
50785 2 1996 2 35.290798 -4.059696 4 4 2 3 1 ... 2 1 3 2 4 4 2 2013 1 0.6371
51630 16 1569 5 36.656708 -3.309214 4 3 2 2 1 ... 2 1 2 1 1 4 2 2013 0 0.7473
17168 16 1567 5 34.767864 -5.004344 4 4 2 2 1 ... 2 1 2 2 4 1 2 2013 0 0.8227
45559 3 267 8 38.058044 -9.418673 4 9 2 43 1 ... 4 1 4 5 4 22 1 2013 1 0.9962
49871 7 1260 8 35.006123 -10.950412 4 9 1 3 1 ... 7 1 1 1 1 27 3 2013 0 0.9568
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39307 8 34 1 38.852669 -6.582841 4 7 1 1 1 ... 2 1 1 6 1 24 2 2011 1 0.8346
18990 3 0 15 37.451633 -5.350428 4 3 2 7 1 ... 1 3 2 5 3 21 3 2011 0 0.9323
28749 7 1476 8 34.739803 -4.585587 3 4 1 2 1 ... 2 1 2 3 1 4 3 2013 0 0.8035
33492 7 998 5 35.432732 -10.584159 3 1 1 2 1 ... 2 1 2 6 1 18 2 2013 0 0.6091
68707 16 481 3 34.765053 -11.226012 4 1 1 3 1 ... 2 1 4 1 1 13 2 2013 1 0.9878

14850 rows × 26 columns

In [168]:
# Resetear el índice para que 'id' sea una columna normal
predictions.reset_index(inplace=True)

# Renombrar la columna de predicción si no se ha hecho ya
predictions.rename(columns={'prediction_label': 'status_group'}, inplace=True)

# Definir el mapeo de valores
status_mapping = {
    0: 'functional',  
    1: 'non functional',
    2: 'functional needs repair'
}

# Convertir la columna 'status_group' a int antes de aplicar el mapeo
predictions['status_group'] = predictions['status_group'].astype(int)

# Aplicar el mapeo a la columna 'status_group'
predictions['status_group'] = predictions['status_group'].replace(status_mapping)

# Convertir la columna 'id' a string
predictions['id'] = predictions['id'].astype(str)

# Guardar las columnas 'id' y 'status_group' en un archivo CSV
predictions[['id', 'status_group']].to_csv('submission.csv', index=False)

# Verificar los cambios
print(predictions[['id', 'status_group']].head())
      id    status_group
0  50785  non functional
1  51630      functional
2  17168      functional
3  45559  non functional
4  49871      functional
In [169]:
from IPython.display import FileLink
FileLink('submission.csv')
Out[169]:
submission.csv

Finalmente el modelo óptimo ha sido Extreme Gradient Boosting, con un puntaje de 0.7802

In [ ]: